In [26]:
import os
import os.path as osp

import cufflinks as cf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.offline
import seaborn as sns
import ydata_profiling
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# --- Plotting / display options ---------------------------------------------
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
pd.set_option('display.max_columns', 38)

# --- Data location and split selection --------------------------------------
# NOTE(review): DATA_DIR is a machine-specific absolute path; adjust it when
# running this notebook elsewhere.
DATA_DIR = "/home/gangda/workspace/ds-richter/data"
# `split_set` picks both the train_<split_set>_{values,labels}.csv pair loaded
# below and the hyper-parameter set chosen in the next cell.
split_set = "1_2"

# --- Load features, labels and the test features -----------------------------
# All three frames are indexed by `building_id`.
X = pd.read_csv(osp.join(DATA_DIR, f'train_{split_set}_values.csv'), index_col='building_id')
y = pd.read_csv(osp.join(DATA_DIR, f'train_{split_set}_labels.csv'), index_col='building_id')
X_test = pd.read_csv(osp.join(DATA_DIR, 'test_values.csv'), index_col='building_id')

# Later cells slice X and y with the same positional indices (`iloc`), so the
# two frames must list the same buildings in the same order.
assert X.index.equals(y.index), "X and y must contain the same building_id values in the same order"

# Columns handed to CatBoost as categorical features.
cat_cols = [
    'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
    'land_surface_condition', 'foundation_type', 'roof_type',
    # Deliberately left out of the categorical list:
    #    'age','area_percentage','height_percentage', 'count_families','count_floors_pre_eq',
    'ground_floor_type', 'other_floor_type', 'position',
    'plan_configuration', 'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
    'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered', 'has_superstructure_other',
    'legal_ownership_status', 'has_secondary_use',
    'has_secondary_use_agriculture', 'has_secondary_use_hotel',
    'has_secondary_use_rental', 'has_secondary_use_institution',
    'has_secondary_use_school', 'has_secondary_use_industry',
    'has_secondary_use_health_post', 'has_secondary_use_gov_office',
    'has_secondary_use_use_police', 'has_secondary_use_other',
]

# Fail here, not minutes into training, if a listed column is absent.
missing_cat_cols = [col for col in cat_cols if col not in X.columns]
assert not missing_cat_cols, f"cat_cols not present in X: {missing_cat_cols}"

In [27]:
# Earlier parameter set, kept for reference only -- nothing below uses it.
# best_params = {
#     "random_seed": 42,
#     "use_best_model": True,
#     "eval_metric": "F1",
#     "bagging_temperature": 0.008240891229822092,
#     "boosting_type": "Ordered",
#     "border_count": 11,
#     "colsample_bylevel": 0.30528252170397946,
#     "depth": 9,
#     "l2_leaf_reg": 7,
#     "learning_rate": 0.065,
#     "min_data_in_leaf": 26,
#     "n_estimators": 7537,
#     "od_type": "IncToDec",
#     "random_strength": 0.00043391429996536275
# }

# CatBoost parameters used when split_set == '1_2'.
best_params_1_2 = {
    "random_seed": 42,
    "use_best_model": True,
    "eval_metric": "F1",
    'n_estimators': 5000,
    'depth': 9,
    'learning_rate': 0.06,
    'border_count': 17,
    'random_strength': 6.504831464247402e-06
}

# CatBoost parameters used when split_set == '2_3'.
best_params_2_3 = {
    "random_seed": 42,
    "use_best_model": True,
    "eval_metric": "F1",
    'n_estimators': 5000,
    'depth': 8,
    'learning_rate': 0.07,
    'border_count': 15,
    'random_strength': 0.0014150278090864392
}

# Explicit lookup: an unrecognised `split_set` (e.g. a typo) raises instead of
# silently training with the 2_3 parameters.
params_by_split = {
    '1_2': best_params_1_2,
    '2_3': best_params_2_3,
}
if split_set not in params_by_split:
    raise ValueError(
        f"No tuned parameters for split_set={split_set!r}; expected one of {sorted(params_by_split)}"
    )
best_params = params_by_split[split_set]

In [None]:
# K-fold cross-validation: fit one CatBoost model per fold and keep all of
# them in `best_models` for the prediction and export cells further down.
#
# NOTE(review): each fold's validation split is used both as the early-stopping
# eval_set and for the reported score, so the score is likely optimistic.

# Patience passed to CatBoost's `early_stopping_rounds`.
EARLY_STOPPING_ROUNDS = 1000

kf = KFold(n_splits=5)
best_models = []
fold_scores = []  # micro-averaged F1 of each fold, in fold order
val_acc = 0       # running sum of fold scores; a later cell divides it by len(best_models)
for train_index, val_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    model = CatBoostClassifier(cat_features=cat_cols, **best_params, silent=False, thread_count=-1)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=EARLY_STOPPING_ROUNDS)

    fold_score = f1_score(y_val, model.predict(X_val), average='micro')
    fold_scores.append(fold_score)
    # Updated inside the loop so a partially completed (interrupted) run still
    # leaves `val_acc` and `best_models` consistent with each other.
    val_acc += fold_score
    best_models.append(model)

print(f'val_acc: {val_acc/len(best_models): .5f}')
print('fold scores: ' + ', '.join(f'{score:.5f}' for score in fold_scores))

0:	learn: 0.9311274	test: 0.9303526	best: 0.9303526 (0)	total: 298ms	remaining: 24m 49s
1:	learn: 0.9313068	test: 0.9305356	best: 0.9305356 (1)	total: 456ms	remaining: 18m 58s
2:	learn: 0.9313238	test: 0.9313374	best: 0.9313374 (2)	total: 750ms	remaining: 20m 49s
3:	learn: 0.9312186	test: 0.9311147	best: 0.9313374 (2)	total: 885ms	remaining: 18m 25s
4:	learn: 0.9323317	test: 0.9323069	best: 0.9323069 (4)	total: 1.16s	remaining: 19m 23s
5:	learn: 0.9334517	test: 0.9331602	best: 0.9331602 (5)	total: 1.44s	remaining: 20m
6:	learn: 0.9336339	test: 0.9335000	best: 0.9335000 (6)	total: 1.71s	remaining: 20m 19s
7:	learn: 0.9340739	test: 0.9340973	best: 0.9340973 (7)	total: 1.96s	remaining: 20m 24s
8:	learn: 0.9342345	test: 0.9341971	best: 0.9341971 (8)	total: 2.17s	remaining: 20m 5s
9:	learn: 0.9345722	test: 0.9346438	best: 0.9346438 (9)	total: 2.38s	remaining: 19m 49s
10:	learn: 0.9347261	test: 0.9347589	best: 0.9347589 (10)	total: 2.53s	remaining: 19m 8s
11:	learn: 0.9348251	test: 0.9351641

In [32]:
# Show the mean fold score again without re-running the training cell.
mean_val_score = val_acc / len(best_models)
print(f'val_acc: {mean_val_score: .5f}')

val_acc:  0.90355


In [None]:
# Collect validation-fold class probabilities: every row of X is scored by the
# model whose validation split contained it.
# `kf.split(X)` is re-run here and must yield the same folds as in the training
# cell (true for the unshuffled KFold created there).
assert len(best_models) == kf.n_splits, "expected one trained model per fold"

val_probs = None
for fold_model, (_, val_index) in zip(best_models, kf.split(X)):
    fold_probs = fold_model.predict_proba(X.iloc[val_index])
    if val_probs is None:
        # Allocate once the number of probability columns is known; NaN makes
        # any row that is never written easy to spot.
        val_probs = np.full((len(X), fold_probs.shape[1]), np.nan, dtype=fold_probs.dtype)
    # Write by position so row i of val_probs always belongs to row i of X,
    # whatever order the splitter yields the folds in.
    val_probs[val_index] = fold_probs

val_probs

In [None]:
# Average the test-set class probabilities predicted by every fold model.
test_probs = None
for fold_model in best_models:
    fold_test_probs = fold_model.predict_proba(X_test)
    # The first fold seeds the running sum; later folds are added element-wise.
    test_probs = fold_test_probs if test_probs is None else test_probs + fold_test_probs

# Turn the sum into a mean over the fold models.
test_probs /= len(best_models)
test_probs

In [None]:
# Save the validation-fold and test probabilities as .npy files under
# DATA_DIR/intermediate. File names encode the fold count and `split_set`.
intermediate_dir = osp.join(DATA_DIR, 'intermediate')
# np.save does not create missing directories; make sure the target exists so
# the results of a long training run are not lost to a FileNotFoundError.
os.makedirs(intermediate_dir, exist_ok=True)

np.save(osp.join(intermediate_dir, 'catboost_{}fold_v2_{}_train.npy'.format(kf.n_splits, split_set)), val_probs)
np.save(osp.join(intermediate_dir, 'catboost_{}fold_v2_{}_test.npy'.format(kf.n_splits, split_set)), test_probs)

In [None]:
# Export every fold model in CatBoost's binary .cbm format. The directory is
# relative to the notebook's current working directory.
model_dir = osp.join("models", "intermediate")
# Create the directory up front; it may not exist on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)

for i, model in enumerate(best_models):
    # File name encodes: number of folds, fold index, split_set.
    model.save_model(osp.join(model_dir, "catboost_{}fold{}_{}.cbm".format(kf.n_splits, i, split_set)))