In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
import cufflinks as cf
import plotly.offline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.metrics import f1_score
import os.path as osp

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
pd.set_option('display.max_columns', 38)

DATA_DIR = "/home/gangda/workspace/ds-richter/data"
split_set = "2_3"

X = pd.read_csv(osp.join(DATA_DIR, 'train_{}_values.csv'.format(split_set)), index_col='building_id')
y = pd.read_csv(osp.join(DATA_DIR, 'train_{}_labels.csv'.format(split_set)), index_col='building_id')
X_test = pd.read_csv(DATA_DIR + '/test_values.csv', index_col='building_id')

cat_cols=['geo_level_1_id','geo_level_2_id','geo_level_3_id',
    #    'age','area_percentage','height_percentage', 'count_families','count_floors_pre_eq',
    #    'land_surface_condition','foundation_type', 'roof_type',
    #    'ground_floor_type', 'other_floor_type', 'position',
    #    'plan_configuration', 'legal_ownership_status',
       'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other',
       
		'land_surface_condition_n', 'land_surface_condition_o',
		'land_surface_condition_t', 'foundation_type_h', 'foundation_type_i',
		'foundation_type_r', 'foundation_type_u', 'foundation_type_w',
		'roof_type_n', 'roof_type_q', 'roof_type_x', 'ground_floor_type_f',
		'ground_floor_type_m', 'ground_floor_type_v', 'ground_floor_type_x',
		'ground_floor_type_z', 'other_floor_type_j', 'other_floor_type_q',
		'other_floor_type_s', 'other_floor_type_x', 'position_j', 'position_o',
		'position_s', 'position_t', 'plan_configuration_a',
		'plan_configuration_c', 'plan_configuration_d', 'plan_configuration_f',
		'plan_configuration_m', 'plan_configuration_n', 'plan_configuration_o',
		'plan_configuration_q', 'plan_configuration_s', 'plan_configuration_u',
		'legal_ownership_status_a', 'legal_ownership_status_r',
		'legal_ownership_status_v', 'legal_ownership_status_w']

categorical_cols=['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']

X = pd.get_dummies(X, columns=categorical_cols)
y = y - 2
X_test = pd.get_dummies(X_test, columns=categorical_cols)

In [32]:
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'None',
    'seed': 42,
    "bagging_fraction": 0.8883295045279088,
    "bagging_freq": 2,
    "feature_fraction": 0.5542528475943183,
    "lambda_l1": 1.4756834962246912e-07,
    "lambda_l2": 0.001245081454069126,
    "min_child_samples": 14
}

In [33]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def lgb_f1(preds, train_data):
    labels = train_data.get_label()
    preds= np.where(preds < 0.5, 0, 1)
    f1 = f1_score(labels, preds, average='binary')
    return 'F1', f1, True

kf = KFold(n_splits=5)
best_models = []
val_acc = 0
for train_index, val_index in kf.split(X):
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index, :]

    X_val = X.iloc[val_index, :]
    y_val = y.iloc[val_index, :]

    d_training = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols, free_raw_data=False)
    d_test = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols, free_raw_data=False)

    model = lgb.train(params, train_set=d_training, valid_sets=[d_training, d_test], feval=lgb_f1,
                      verbose_eval=100, early_stopping_rounds=200, num_boost_round=2000)

    # val_preds = model.predict(X_val)
    # print(val_preds)
    # val_acc += f1_score(y_val, val_preds)
    best_models.append(model)
    # print(f'val_acc: {val_acc/len(best_models): .5f}')


'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



[LightGBM] [Info] Number of positive: 69694, number of negative: 118687
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10722
[LightGBM] [Info] Number of data points in the train set: 188381, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.369963 -> initscore=-0.532376
[LightGBM] [Info] Start training from score -0.532376
Training until validation scores don't improve for 200 rounds



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[100]	training's F1: 0.725101	valid_1's F1: 0.700451
[200]	training's F1: 0.737287	valid_1's F1: 0.702488
[300]	training's F1: 0.74137	valid_1's F1: 0.702863
[400]	training's F1: 0.744814	valid_1's F1: 0.704282
[500]	training's F1: 0.747414	valid_1's F1: 0.704059
[600]	training's F1: 0.75088	valid_1's F1: 0.703632
[700]	training's F1: 0.752824	valid_1's F1: 0.70533
[800]	training's F1: 0.75563	valid_1's F1: 0.705164
[900]	training's F1: 0.757684	valid_1's F1: 0.705463
[1000]	training's F1: 0.760104	valid_1's F1: 0.70564
Early stopping, best iteration is:
[890]	training's F1: 0.757423	valid_1's F1: 0.706485



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



[LightGBM] [Info] Number of positive: 69757, number of negative: 118624
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10712
[LightGBM] [Info] Number of data points in the train set: 188381, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370297 -> initscore=-0.530941
[LightGBM] [Info] Start training from score -0.530941
Training until validation scores don't improve for 200 rounds



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[100]	training's F1: 0.728954	valid_1's F1: 0.693767
[200]	training's F1: 0.741108	valid_1's F1: 0.696447
[300]	training's F1: 0.745336	valid_1's F1: 0.697339
[400]	training's F1: 0.748709	valid_1's F1: 0.697934
[500]	training's F1: 0.751655	valid_1's F1: 0.697736
[600]	training's F1: 0.754824	valid_1's F1: 0.697722
Early stopping, best iteration is:
[457]	training's F1: 0.750863	valid_1's F1: 0.698973



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



[LightGBM] [Info] Number of positive: 69820, number of negative: 118562
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10699
[LightGBM] [Info] Number of data points in the train set: 188382, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370630 -> initscore=-0.529516
[LightGBM] [Info] Start training from score -0.529516
Training until validation scores don't improve for 200 rounds



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[100]	training's F1: 0.72743	valid_1's F1: 0.700217
[200]	training's F1: 0.739662	valid_1's F1: 0.703761
[300]	training's F1: 0.743764	valid_1's F1: 0.704966
[400]	training's F1: 0.746871	valid_1's F1: 0.705609
[500]	training's F1: 0.750268	valid_1's F1: 0.70617
[600]	training's F1: 0.752206	valid_1's F1: 0.706717
[700]	training's F1: 0.755058	valid_1's F1: 0.707073
[800]	training's F1: 0.75784	valid_1's F1: 0.706795
Early stopping, best iteration is:
[676]	training's F1: 0.754798	valid_1's F1: 0.707778



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



[LightGBM] [Info] Number of positive: 69838, number of negative: 118544
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10678
[LightGBM] [Info] Number of data points in the train set: 188382, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370725 -> initscore=-0.529106
[LightGBM] [Info] Start training from score -0.529106
Training until validation scores don't improve for 200 rounds



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[100]	training's F1: 0.725231	valid_1's F1: 0.696472
[200]	training's F1: 0.737748	valid_1's F1: 0.701372
[300]	training's F1: 0.742792	valid_1's F1: 0.701681
[400]	training's F1: 0.746192	valid_1's F1: 0.70152
[500]	training's F1: 0.749329	valid_1's F1: 0.702335
[600]	training's F1: 0.752451	valid_1's F1: 0.701473
[700]	training's F1: 0.754612	valid_1's F1: 0.701972
[800]	training's F1: 0.757297	valid_1's F1: 0.702138
[900]	training's F1: 0.759973	valid_1's F1: 0.703568
[1000]	training's F1: 0.761356	valid_1's F1: 0.704259
[1100]	training's F1: 0.763388	valid_1's F1: 0.704291
[1200]	training's F1: 0.766262	valid_1's F1: 0.703162
Early stopping, best iteration is:
[1005]	training's F1: 0.761777	valid_1's F1: 0.704884



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


Using categorical_feature in Dataset.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



[LightGBM] [Info] Number of positive: 69763, number of negative: 118619
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10709
[LightGBM] [Info] Number of data points in the train set: 188382, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.370327 -> initscore=-0.530813
[LightGBM] [Info] Start training from score -0.530813



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



Training until validation scores don't improve for 200 rounds
[100]	training's F1: 0.729082	valid_1's F1: 0.701057
[200]	training's F1: 0.740367	valid_1's F1: 0.702207
[300]	training's F1: 0.745017	valid_1's F1: 0.703107
[400]	training's F1: 0.748108	valid_1's F1: 0.702754
[500]	training's F1: 0.751057	valid_1's F1: 0.703509
[600]	training's F1: 0.753952	valid_1's F1: 0.703481
[700]	training's F1: 0.756021	valid_1's F1: 0.70368
[800]	training's F1: 0.758526	valid_1's F1: 0.704193
Early stopping, best iteration is:
[644]	training's F1: 0.754685	valid_1's F1: 0.704767


In [34]:
val_probs = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_val = X.iloc[val_index, :]
    val_probs.append(best_models[i].predict(X_val))

val_probs = np.concatenate(val_probs, axis=0)
val_probs

array([0.80456197, 0.07488217, 0.82034384, ..., 0.979163  , 0.00266198,
       0.92753116])

In [35]:
test_probs = None
for best_model in best_models:
    if test_probs is None:
        test_probs = best_model.predict(X_test, num_iteration=best_model.best_iteration)
    else:
        test_probs += best_model.predict(X_test, num_iteration=best_model.best_iteration)

test_probs /= len(best_models)
test_probs

array([0.69614389, 0.08379004, 0.20708981, ..., 0.13067967, 0.14235603,
       0.06230433])

In [36]:
np.save(DATA_DIR + '/intermediate/lightgbm_{}fold_{}_train.npy'.format(kf.n_splits, split_set), val_probs)
np.save(DATA_DIR + '/intermediate/lightgbm_{}fold_{}_test.npy'.format(kf.n_splits, split_set), test_probs)