In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
import cufflinks as cf
import plotly.offline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import f1_score

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
pd.set_option('display.max_columns', 38)

DATA_DIR = "/home/gangda/workspace/ds-richter/data"

X = pd.read_csv(DATA_DIR + '/train_values.csv', index_col='building_id')
y = pd.read_csv(DATA_DIR + '/train_labels.csv', index_col='building_id')
X_test = pd.read_csv(DATA_DIR + '/test_values.csv', index_col='building_id')

cat_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'count_floors_pre_eq', 'has_superstructure_adobe_mud',
            'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
            'has_superstructure_cement_mortar_stone',
            'has_superstructure_mud_mortar_brick',
            'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
            'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
            'has_superstructure_rc_engineered', 'has_superstructure_other', 'has_secondary_use',
            'has_secondary_use_agriculture', 'has_secondary_use_hotel',
            'has_secondary_use_rental', 'has_secondary_use_institution',
            'has_secondary_use_school', 'has_secondary_use_industry',
            'has_secondary_use_health_post', 'has_secondary_use_gov_office',
            'has_secondary_use_use_police', 'has_secondary_use_other',
            'land_surface_condition_n', 'land_surface_condition_o',
            'land_surface_condition_t', 'foundation_type_h', 'foundation_type_i',
            'foundation_type_r', 'foundation_type_u', 'foundation_type_w',
            'roof_type_n', 'roof_type_q', 'roof_type_x', 'ground_floor_type_f',
            'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x',
            'ground_floor_type_z', 'other_floor_type_j', 'other_floor_type_q',
            'other_floor_type_s', 'other_floor_type_x', 'position_j', 'position_o',
            'position_s', 'position_t', 'plan_configuration_a',
            'plan_configuration_c', 'plan_configuration_d', 'plan_configuration_f',
            'plan_configuration_m', 'plan_configuration_n', 'plan_configuration_o',
            'plan_configuration_q', 'plan_configuration_s', 'plan_configuration_u',
            'legal_ownership_status_a', 'legal_ownership_status_r',
            'legal_ownership_status_v', 'legal_ownership_status_w']

non_numeric_columns = ['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type',
                            'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']
X = pd.get_dummies(X, columns=non_numeric_columns)
y = y - 1
X_test = pd.get_dummies(X_test, columns=non_numeric_columns)

In [61]:
def evaluate_microF1_lgb(preds, train_data):
    labels = train_data.get_label()
    num_classes = len(np.unique(labels))
    preds = preds.reshape(num_classes, -1).argmax(axis=0)
    f1 = f1_score(labels, preds, average='micro')
    return 'microF1', f1, True


# params = {
#     'objective': 'multiclass',
#     'num_class': 3,
#     'boosting': 'gbdt',
#     'metric': 'None',
#     'seed': 42,
# }

params = {
    'objective': 'multiclass',
    'num_class': 3,
    'boosting': 'gbdt',
    'metric': 'None',
    'seed': 42,
    "bagging_fraction": 0.8883295045279088,
    "bagging_freq": 2,
    "feature_fraction": 0.5542528475943183,
    "lambda_l1": 1.4756834962246912e-07,
    "lambda_l2": 0.001245081454069126,
    "min_child_samples": 14
}

In [68]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
best_models = []
val_acc = 0
for train_index, val_index in kf.split(X):
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index, :]

    X_val = X.iloc[val_index, :]
    y_val = y.iloc[val_index, :]

    d_training = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols, free_raw_data=False)
    d_test = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols, free_raw_data=False)

    model = lgb.train(params, train_set=d_training, valid_sets=[d_training, d_test], feval=evaluate_microF1_lgb,
                      verbose_eval=25, early_stopping_rounds=200, num_boost_round=2000)

    val_preds = model.predict(X_val).argmax(axis=1)
    val_acc += f1_score(y_val, val_preds, average='micro')
    best_models.append(model)

print("\n ========================================================")
print(f'val_acc: {val_acc/len(best_models): .5f}')


'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10607
[LightGBM] [Info] Number of data points in the train set: 173734, number of used features: 66



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[LightGBM] [Info] Start training from score -2.340501
[LightGBM] [Info] Start training from score -0.563022
[LightGBM] [Info] Start training from score -1.095911
Training until validation scores don't improve for 200 rounds
[25]	training's microF1: 0.753733	valid_1's microF1: 0.739855
[50]	training's microF1: 0.769055	valid_1's microF1: 0.745277
[75]	training's microF1: 0.77827	valid_1's microF1: 0.74705
[100]	training's microF1: 0.783071	valid_1's microF1: 0.74758
[125]	training's microF1: 0.787163	valid_1's microF1: 0.747983
[150]	training's microF1: 0.789926	valid_1's microF1: 0.747246
[175]	training's microF1: 0.791791	valid_1's microF1: 0.747476
[200]	training's microF1: 0.793144	valid_1's microF1: 0.747683
[225]	training's microF1: 0.794749	valid_1's microF1: 0.747856
[250]	training's microF1: 0.795866	valid_1's microF1: 0.748052
[275]	training's microF1: 0.796885	valid_1's microF1: 0.747718
[300]	training's microF1: 0.798076	valid_1's microF1: 0.748213
[325]	training's microF1: 


'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10561
[LightGBM] [Info] Number of data points in the train set: 173734, number of used features: 68



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[LightGBM] [Info] Start training from score -2.339008
[LightGBM] [Info] Start training from score -0.565015
[LightGBM] [Info] Start training from score -1.092953
Training until validation scores don't improve for 200 rounds
[25]	training's microF1: 0.752754	valid_1's microF1: 0.740178
[50]	training's microF1: 0.769832	valid_1's microF1: 0.746313
[75]	training's microF1: 0.778005	valid_1's microF1: 0.747545
[100]	training's microF1: 0.782196	valid_1's microF1: 0.74819
[125]	training's microF1: 0.785655	valid_1's microF1: 0.748823
[150]	training's microF1: 0.788286	valid_1's microF1: 0.74903
[175]	training's microF1: 0.790076	valid_1's microF1: 0.749099
[200]	training's microF1: 0.791958	valid_1's microF1: 0.749191
[225]	training's microF1: 0.793644	valid_1's microF1: 0.749525
[250]	training's microF1: 0.794882	valid_1's microF1: 0.749157
[275]	training's microF1: 0.796165	valid_1's microF1: 0.749099
[300]	training's microF1: 0.796891	valid_1's microF1: 0.749168
[325]	training's microF1:


'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10572
[LightGBM] [Info] Number of data points in the train set: 173734, number of used features: 68



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[LightGBM] [Info] Start training from score -2.337993
[LightGBM] [Info] Start training from score -0.564053
[LightGBM] [Info] Start training from score -1.094878
Training until validation scores don't improve for 200 rounds
[25]	training's microF1: 0.753848	valid_1's microF1: 0.739556
[50]	training's microF1: 0.769798	valid_1's microF1: 0.746037
[75]	training's microF1: 0.778362	valid_1's microF1: 0.747534
[100]	training's microF1: 0.782645	valid_1's microF1: 0.747568
[125]	training's microF1: 0.786058	valid_1's microF1: 0.748201
[150]	training's microF1: 0.788665	valid_1's microF1: 0.748328
[175]	training's microF1: 0.790853	valid_1's microF1: 0.747971
[200]	training's microF1: 0.792113	valid_1's microF1: 0.748144
[225]	training's microF1: 0.793961	valid_1's microF1: 0.748696
[250]	training's microF1: 0.795285	valid_1's microF1: 0.748742
[275]	training's microF1: 0.796597	valid_1's microF1: 0.74903
[300]	training's microF1: 0.797927	valid_1's microF1: 0.748996
[325]	training's microF1

In [69]:
val_probs = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_val = X.iloc[val_index, :]
    val_probs.append(best_models[i].predict(X_val))

val_probs = np.concatenate(val_probs, axis=0)
val_probs

array([[2.55035211e-04, 9.61508595e-02, 9.03594105e-01],
       [2.82510527e-03, 8.02580378e-01, 1.94594517e-01],
       [1.34332425e-03, 2.21777427e-01, 7.76879249e-01],
       ...,
       [2.16822781e-04, 1.25319838e-02, 9.87251193e-01],
       [2.11758990e-01, 7.84060099e-01, 4.18091071e-03],
       [2.52737982e-04, 5.97061897e-02, 9.40041072e-01]])

In [70]:
test_probs = None
for best_model in best_models:
    if test_probs is None:
        test_probs = best_model.predict(X_test, num_iteration=best_model.best_iteration)
    else:
        test_probs += best_model.predict(X_test, num_iteration=best_model.best_iteration)

test_probs /= len(best_models)
test_probs

array([[0.00099906, 0.27957054, 0.7194304 ],
       [0.00110119, 0.90006587, 0.09883294],
       [0.01312887, 0.79558352, 0.19128761],
       ...,
       [0.02682265, 0.85498407, 0.11819328],
       [0.00669397, 0.83543399, 0.15787204],
       [0.5832066 , 0.39083669, 0.02595671]])

In [71]:
version = 1

In [72]:
np.save(DATA_DIR + '/intermediate/lightgbm_{}fold_v{}_train.npy'.format(kf.n_splits, version), val_probs)
np.save(DATA_DIR + '/intermediate/lightgbm_{}fold_v{}_test.npy'.format(kf.n_splits, version), test_probs)

In [32]:
test_preds = np.argmax(test_probs, axis=1) + 1

submission_format = pd.read_csv(DATA_DIR + '/submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=test_preds,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission['damage_grade'] = my_submission['damage_grade'].astype(int)
my_submission.to_csv(DATA_DIR + '/submission/lightgbm_{}fold_v{}.csv'.format(kf.n_splits, version))