In [1]:
import os
import gc
import time
import pickle

import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from bayes_opt import BayesianOptimization
# import keras
# import tensorflow as tf
# from keras.models import Sequential, Model
# from keras.layers import (Conv1D, Dense, Activation, BatchNormalization, Lambda,
#                           Dropout, InputLayer, Input, GlobalMaxPool1D, Flatten)
# from keras.wrappers.scikit_learn import KerasClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
PATH_TO_DATA = os.path.abspath('../')+'/'

In [3]:
DATA_PATH = '../input/'
PRED_TEST_PATH = '../submit/'
PRED_TRAIN_PATH = '../submit/'
FOLDS_PATH = '../kvr777/folds/'
FEATURES_PATH = '../features/'
MODEL_NAME = 'leonid09blend'

In [4]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [5]:
def read_pickle(fpath):
    with open(fpath, 'rb') as fin:
        return pickle.load(fin)
    
BASE_PATH = '../submit/'

In [6]:
!ls $BASE_PATH

artgor_20folds_test_catboost.pkl
artgor_20folds_train_catboost.pkl
ashevelev_20folds_test_mxnet_RF_cv1789.pkl
ashevelev_20folds_test_RF_cv1337_std0028.pkl
ashevelev_20folds_train_mxnet_RF_cv1789.pkl
ashevelev_20folds_train_RF_cv1337_std0028.pkl
blend0820_without_leak_averaged_cv1278_std0007.csv
blendings
chislov_20folds_test_lgbm_cv1360_std0019.pkl
chislov_20folds_train_lgbm_cv1360_std0019.pkl
egorlabintcev_20folds_test_lgb_cv13406.pkl
fanran_LEAK_TRAINTEST_CV20x5_1.3688.csv
insaf_20_fold_train_xgb_gp_clus_cv133_std0021_2ndkernel.pkl
insaf_20_folds_test_xgb_gp_clus_cv133_std0021_2ndkernel.pkl
iv_cv1404_wo_leak.csv
izmaylov_20folds_test_cv1323_std0021.pkl
izmaylov_20folds_train_cv1323_std0021.pkl
jiazhen-to-armamut-via-gurchetan1000-compiled-leak
leonid04_20folds_test_lgbm_cv1317_std0007.pkl
leonid04_20folds_train_lgbm_cv1317_std0007.pkl
leonid05_20folds_test_averaged_cv1319_std0007.csv
leonid05_20folds_test_cv1319_std0007.pkl
leonid05_20folds_train_cv1319_std0007.pkl
leonid06_20folds_t

## CV Folds

In [7]:
def get_20_cv_splits(data):
    #stratify_classes = y
    train = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])
    stratify_classes =  train.target.apply(lambda x: int(np.log10(x)))
    splits = {}
    for random_state in range(20):
        column = np.zeros(data.shape[0])
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
        for i, (_, test_index) in enumerate(sss.split(data, stratify_classes)):
            column[test_index] = i

        splits["split{}".format(random_state)] = column

    pd.DataFrame(splits, index=data.index).to_csv(os.path.join(PATH_TO_DATA, 'folds/cv_splits_cleandata_stat_bin_red.csv'))

In [8]:
 # function to generate 100 folds from create_folds_from_cv_splits func
def create_folds_from_cv_splits(in_path):
    cv_splits = pd.read_csv(os.path.join(PATH_TO_DATA, in_path))
    folds_list = []
    for ind, i in enumerate(cv_splits.columns[1:]):
        folds = list(set(cv_splits[i].values))
        folds_list.append([])
        for m in folds:
            val_idx = list(cv_splits[cv_splits[i]==m].index)
            train_idx = list(set(list(cv_splits.index)) - set(val_idx))
            folds_list[ind].append((train_idx, val_idx))
    with open(os.path.join(PATH_TO_DATA, 'kvr777/folds/custom_cv.pkl'), 'wb') as f:
        pickle.dump(folds_list, f)
    return folds_list

In [9]:
LOAD_CV = True

if LOAD_CV:
    with open(os.path.join(PATH_TO_DATA, 'kvr777/folds/custom_cv.pkl'), 'rb') as f:
        cv_folds = pickle.load(f)
else:
    get_20_cv_splits(train_df)
    cv_folds = create_folds_from_cv_splits(in_path='kvr777/folds/cv_splits_cleandata_stat_bin_red.csv')

In [10]:
def more_features(d):
    return np.concatenate([d, 
                           np.min(d, axis=-1, keepdims=True),
                           np.max(d, axis=-1, keepdims=True),
                           np.mean(d, axis=-1, keepdims=True),
                           np.median(d, axis=-1, keepdims=True)], axis=-1)

In [11]:
test_data = ['ashevelev_20folds_test_mxnet_RF_cv1789.pkl',#
             'chislov_20folds_test_lgbm_cv1360_std0019.pkl',#
             'tenich_20folds_test_1dconvnn_cv1561_std0021.pkl',#
             'egorlabintcev_20folds_test_lgb_cv13406.pkl',
             'leonid04_20folds_test_lgbm_cv1317_std0007.pkl',
             'nefedov_20folds_test_xgb_cv1328_std003.pkl',
             'tenich_20folds_test_baggedlgb_cv1335_std0021.csv',
             'artgor_20folds_test_catboost.pkl',
             'insaf_20_folds_test_xgb_gp_clus_cv133_std0021_2ndkernel.pkl',
             'izmaylov_20folds_test_cv1323_std0021.pkl',
             'ashevelev_20folds_test_RF_cv1337_std0028.pkl',
             'leonid05_20folds_test_cv1319_std0007.pkl',
             'leonid06_20folds_test_cv1302_std0007.pkl',
             'leonid07_20folds_test_cv1303_std0007.pkl',
             'leonid08_20folds_test_cv1303_std0007.pkl',
             'Nikita/alexpengxiao_kernel_test_20_folds_lgb_cv1356_std0029.pkl',#
             'Nikita/alexpengxiao_kernel_test_20_folds_xgb_cv1348_std0027.pkl',
             'Nikita/nikita_b_test_20_folds_xgb_cv1323_std0027.pkl',
             'Nikita/nikita_test_20_folds_lgb_cv1340_std0027.pkl',
             'Nikita/nikita_test_20_folds_lgb_cv1341_std0026.pkl',
             'Nikita/nikita_test_20_folds_lgb_cv1348_std0029.pkl',#
             'Nikita/nikita_test_20_folds_lgb_cv1320_std0028.pkl',
             'vykhand01_20folds_test_lgbm_cv1322_std0029.pkl',
             'nefedov_20folds_train_xgb_cv1319_std003.pkl',
             'new_zhav1kwell_ert_cv1318_std003.pkl'
             ]

train_data = ['ashevelev_20folds_train_mxnet_RF_cv1789.pkl',#
              'chislov_20folds_train_lgbm_cv1360_std0019.pkl',#
              'tenich_20folds_train_1dconvnn_cv1561_std0021.pkl',#
              'oof_basic_pipeline_20folds_13406.pkl',
              'leonid04_20folds_train_lgbm_cv1317_std0007.pkl',
              'nefedov_20folds_train_xgb_cv1328_std003.pkl',
              'tenich_20folds_train_baggedlgb_cv1335_std0021.csv',
              'artgor_20folds_train_catboost.pkl',
              'insaf_20_fold_train_xgb_gp_clus_cv133_std0021_2ndkernel.pkl',
              'izmaylov_20folds_train_cv1323_std0021.pkl',
              'ashevelev_20folds_train_RF_cv1337_std0028.pkl',
              'leonid05_20folds_train_cv1319_std0007.pkl',
              'leonid06_20folds_train_cv1302_std0007.pkl',
              'leonid07_20folds_train_cv1303_std0007.pkl',
              'leonid08_20folds_train_cv1303_std0007.pkl',
              'Nikita/alexpengxiao_kernel_train_20_folds_lgb_cv1356_std0029.pkl',#
              'Nikita/alexpengxiao_kernel_train_20_folds_xgb_cv1348_std0027.pkl',
              'Nikita/nikita_b_train_20_folds_xgb_cv1323_std0027.pkl',
              'Nikita/nikita_train_20_folds_lgb_cv1340_std0027.pkl',
              'Nikita/nikita_train_20_folds_lgb_cv1341_std0026.pkl',
              'Nikita/nikita_train_20_folds_lgb_cv1348_std0029.pkl',#
              'Nikita/nikita_train_20_folds_lgb_cv1320_std0028.pkl',
              'vykhand01_20folds_train_lgbm_cv1322_std0029.pkl',
              'nefedov_20folds_train_xgb_cv1319_std003.pkl',
              'zhav1kwell_20_folds_train_ert_cv1318_std003.pkl'
              ]

In [12]:
%%time
# train_preds = np.array([read_pickle(BASE_PATH + f) for f in train_data])
# test_preds = np.array([read_pickle(BASE_PATH + f) for f in test_data])

# X = [train_preds[:, i, :] for i in range(20)]
# test = [test_preds[:, i, :] for i in range(20)]

X = pd.concat([pd.DataFrame(read_pickle(BASE_PATH + f)).T for f in train_data], axis = 1)
print('X done')
test = pd.concat([pd.DataFrame(read_pickle(BASE_PATH + f)).T for f in test_data], axis = 1)
print('test done')
# for i in range(20):
#     X[i][:, 0] = np.log1p(X[i][:, 0])
# #     X[i][:, 4] = np.log1p(X[i][:, 4])
#     X[i][:, 7] = np.log1p(X[i][:, 7])
#     X[i][:, 10] = np.log1p(X[i][:, 10])
    
test = np.log1p(test)

train_df = pd.read_csv(f'{PATH_TO_DATA}input/train.csv')
y = np.log1p(train_df.target)
print(X.shape, y.shape, test.shape)
del train_df
gc.collect()

X done
test done
(4459, 500) (4459,) (49342, 500)
Wall time: 1min 1s


In [13]:
X.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10,11,12,13,14,15,16,17,18,19
0,2832042.0,2962803.0,2594375.0,3143294.0,2759915.0,2347741.0,2486025.0,2950824.0,2840005.0,3105339.0,...,14.767484,14.747596,14.689561,14.698137,14.669238,14.70212,14.765604,14.676999,14.749081,14.746367
1,1848837.0,1992488.0,2207480.0,2366350.0,1682422.0,2009920.0,1597052.0,2273976.0,2317608.0,1849735.0,...,14.668626,14.758232,14.749869,14.690854,14.675423,14.670137,14.556115,14.711833,14.760038,14.784759
2,2861534.0,3325314.0,3233440.0,3488722.0,3502675.0,3003053.0,2482458.0,3461111.0,3375058.0,2970180.0,...,14.833868,14.802847,14.837573,14.856471,14.883386,14.881556,14.857297,14.919303,14.832991,14.777038
3,869732.9,782985.9,948031.4,919406.7,937636.9,818358.2,995962.8,966305.2,910418.9,868280.8,...,13.733369,13.784835,13.582351,13.7854,13.606475,13.60049,13.814911,13.73885,13.692593,13.706627
4,4146797.0,3992424.0,3754497.0,3889899.0,4201063.0,4009800.0,3958549.0,3205308.0,3628521.0,3838257.0,...,14.976959,14.903025,15.062302,14.962032,14.955784,15.001003,14.987674,14.97736,14.982414,15.041061
5,3232196.0,2361613.0,1291814.0,1445467.0,2341916.0,2540010.0,3402041.0,3463169.0,3390596.0,2923406.0,...,14.720076,14.736186,14.593845,14.689543,14.743393,14.720136,14.833548,14.811833,14.853132,14.768808
6,327782.7,148351.5,243878.6,226926.0,284141.2,268984.4,253532.8,273404.3,351370.8,285217.8,...,11.636044,11.912775,11.729613,11.755063,11.767263,11.620009,11.69345,11.719832,11.714067,11.86906
7,4540922.0,4509616.0,6118600.0,4233030.0,4416945.0,5575118.0,3323755.0,4211592.0,4541200.0,4511588.0,...,15.058416,15.168764,15.247369,15.067891,15.094034,15.254368,15.044469,15.222354,15.117875,15.151168
8,2221663.0,2021820.0,3202764.0,2064614.0,2211733.0,2806549.0,2084794.0,4172736.0,2977960.0,2723344.0,...,14.468447,14.429849,14.4957,14.437555,14.435801,14.461508,14.442009,14.579465,14.528498,14.509268
9,959781.5,900297.4,1123993.0,982692.6,796208.1,1003426.0,1071410.0,915174.1,1090330.0,1012308.0,...,14.530877,14.648685,14.477749,14.610083,14.509849,14.541207,14.6787,14.609271,14.589014,14.614146


In [14]:
test.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10,11,12,13,14,15,16,17,18,19
0,14.911401,14.911401,14.911401,14.911401,14.911401,14.911401,14.911401,14.911401,14.911401,14.911401,...,14.701627,14.684726,14.725431,14.745341,14.705209,14.716565,14.703124,14.728486,14.734382,14.687252
1,14.27712,14.27712,14.27712,14.27712,14.27712,14.27712,14.27712,14.27712,14.27712,14.27712,...,13.93589,13.927885,13.943929,13.952025,13.946252,13.961022,13.921171,13.959312,13.967677,13.943854
2,14.720255,14.720255,14.720255,14.720255,14.720255,14.720255,14.720255,14.720255,14.720255,14.720255,...,14.50662,14.48219,14.504688,14.51832,14.504975,14.484002,14.496314,14.537933,14.519676,14.526373
3,15.241589,15.241589,15.241589,15.241589,15.241589,15.241589,15.241589,15.241589,15.241589,15.241589,...,15.306287,15.301992,15.288202,15.273772,15.290146,15.314761,15.277124,15.313032,15.298357,15.314534
4,14.684171,14.684171,14.684171,14.684171,14.684171,14.684171,14.684171,14.684171,14.684171,14.684171,...,14.358911,14.34763,14.396201,14.39043,14.378745,14.365091,14.343846,14.363702,14.373572,14.365174
5,14.921523,14.921523,14.921523,14.921523,14.921523,14.921523,14.921523,14.921523,14.921523,14.921523,...,14.151391,14.116652,14.136599,14.154159,14.149712,14.137335,14.119978,14.144437,14.137026,14.145014
6,14.711668,14.711668,14.711668,14.711668,14.711668,14.711668,14.711668,14.711668,14.711668,14.711668,...,14.129798,14.109647,14.14749,14.13036,14.132148,14.147171,14.154362,14.137867,14.166525,14.141281
7,15.022814,15.022814,15.022814,15.022814,15.022814,15.022814,15.022814,15.022814,15.022814,15.022814,...,14.639775,14.626651,14.640331,14.650334,14.626303,14.612437,14.631202,14.642324,14.634284,14.623593
8,14.874045,14.874045,14.874045,14.874045,14.874045,14.874045,14.874045,14.874045,14.874045,14.874045,...,14.455282,14.422562,14.451239,14.460968,14.444224,14.420442,14.427907,14.434768,14.436729,14.440587
9,14.803345,14.803345,14.803345,14.803345,14.803345,14.803345,14.803345,14.803345,14.803345,14.803345,...,14.770785,14.816562,14.789981,14.793529,14.813369,14.785049,14.789854,14.779319,14.812291,14.798659


In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 500 entries, 0 to 19
dtypes: float64(500)
memory usage: 17.0 MB


In [16]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 500 entries, 0 to 19
dtypes: float64(500)
memory usage: 188.2 MB


In [17]:
# for i in range(len(train_preds)):
#     cur_scores = [mean_squared_error(y, X[j][:, i]) ** 0.5 for j in range(20)]
#     print(np.mean(cur_scores), np.std(cur_scores))

In [18]:
# test.min(), test.max(), np.array(X).min(), np.array(X).max()

In [19]:
# def get_model():
# #     input = Input(shape=(len(test_preds), ))
    
# #     dense_1 = Dense(4, activation='relu')(input)
    
# #     output = Dense(1)(dense_1)
    
# #     model = Model(input, output)
    
# #     model.compile('adam', 'mse')
    
#     return Ridge(alpha=10)

In [20]:
# model = get_model()

In [21]:
# def run_model(train_X, train_y, val_X, val_y, test_X):
#     start_time = time.time()
    
#     model = get_model()
#     model.fit(train_X, train_y)#, batch_size=64, epochs=100, validation_data=(val_X, val_y))
    
#     print('Model training done in {} seconds.'.format(time.time() - start_time))
    
#     pred_test_y = model.predict(test_X).ravel()
#     pred_oof_log = model.predict(val_X).ravel()
    
#     print(train_y.max(), val_y.max())

#     return pred_test_y, pred_oof_log, model

In [22]:
def run_model(train_X, train_y, val_X, val_y, test_X, seed = RANDOM_STATE):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 512,#40
        'max_depth': 5,
        "learning_rate" : 0.004,#0.005
        "bagging_fraction" : 0.45,
        "feature_fraction" : 0.97,#0.6
        "bagging_freq" : 1,
        "verbosity" : -1,
        'num_threads' : 4,
        "seed": seed
    }
    
    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 10000, 
                      valid_sets=[lgtrain, lgval], 
                      verbose_eval=200, 
                      early_stopping_rounds=200)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    
    print(train_y.max(), val_y.max())
    
    return pred_test_y, pred_oof_log, model

In [23]:
def run_calculations(X, test, big_cv_folds, func_name = None):
    if not func_name:
        return print('The function to run is not defined')
    else:
        y_oof_20_preds = []
        fold_errors_20_preds =[]
        avg_test_pred_20_preds = []
        models_20_preds = []
        
        start_time = time.time()
        for ind, cv_folds in enumerate(big_cv_folds):
#             print('Fitting big fold', ind+1, 'out of', len(big_cv_folds))
            y_oof = np.zeros((y.shape[0]))
            fold_errors =[]
            pred_test_list = []
            models = []
            
            for i, (train_index, val_index) in enumerate(cv_folds):
                print('Fitting big fold', ind+1, 'out of', len(big_cv_folds),
                      'and sub fold', i+1, 'out of', len(cv_folds))
#                 X_train, X_val  = X[ind][train_index], X[ind][val_index]
#                 y_train, y_val = y[train_index], y[val_index]
                X_train, X_val  = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y[train_index], y[val_index]

                # part to include additional functions
                if func_name == 'lgb':
#                     pred_test_y, pred_oof_log, clf = run_model(X_train, y_train, X_val, y_val, test[ind],
#                                                              seed = (RANDOM_STATE + ind*i))
                    pred_test_y, pred_oof_log, clf = run_model(X_train, y_train, X_val, y_val, test,
                                                             seed = (RANDOM_STATE + ind*i))
                    models.append(clf)
                else:
                    return print('The function to run is not correct')

                y_oof[val_index] = pred_oof_log
                curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
                print(f'Fold error {curr_fe}')
                fold_errors.append(curr_fe)
                pred_test_list.append(list(pred_test_y))
                print('Time passed: {} seconds.'.format(time.time() - start_time))

            print('Average big fold', ind+1, 'error:', np.sqrt(mean_squared_error(y, y_oof)))
            total_fe_std = round(np.std(fold_errors), 5)
            print(f'Total big fold {ind+1} std {total_fe_std}')
            avg_test_pred = np.mean(pred_test_list, axis=0)
            
            avg_test_pred_20_preds.append(avg_test_pred)
            fold_errors_20_preds.append(fold_errors)
            y_oof_20_preds.append(y_oof)
            models_20_preds.append(models)
            
    return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds, models_20_preds

In [24]:
def lgb_evaluate(max_depth,
                 num_leaves,
                 bagging_fraction,
                 feature_fraction,
                 bagging_freq):
    params = {
        "objective": "regression",
        "metric": "rmse",
        # "num_leaves": 361,  # 40
        # 'max_depth': 21,
        "learning_rate": 0.04,  # 0.005
        # "bagging_fraction": 0.7,
        # "feature_fraction": 0.4,  # 0.6
        # "bagging_freq": 5,
        "verbosity": -1,
        'num_threads': 4,
        "seed": random_state
    }

    params['max_depth'] = int(max_depth)
    params['num_leaves'] = int(num_leaves)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['feature_fraction'] = max(feature_fraction, 0)
    params['bagging_freq'] = int(bagging_freq)

    cv_result = lgb.cv(params, lgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=random_state,
                       verbose_eval=20,
                       stratified=False, #have to add, because of objective regression
                       early_stopping_rounds=50)

    return -cv_result['rmse-mean'][-1]



In [None]:
%%time
lgtrain = lgb.Dataset(X, label=y)

num_rounds = 100
random_state = 2018
num_iter = 50
init_points = 8
# params = {
#     'eta': 0.1,
#     'silent': 1,
#     'eval_metric': 'mae',
#     'verbose_eval': True,
#     'seed': random_state
# }

lgbBO = BayesianOptimization(lgb_evaluate, {'max_depth': (5, 23),
                                            'num_leaves': (150, 2048),
                                            'bagging_fraction': (0.4, 0.9),
                                            'feature_fraction': (0.8, 0.99),
                                            'bagging_freq': (1, 4),
                                            })
lgbBO.explore({'max_depth': [8, 5, 8, 13, 21],
               'num_leaves': [188, 40, 80, 361, 512],
               'bagging_fraction': [0.7417, 0.9, 0.5, 0.6, 0.4],
               'feature_fraction': [0.9884, 0.8, 0.99, 0.8, 0.99],
               'bagging_freq': [2, 1, 2, 2, 1],
               })
lgbBO.maximize(init_points=init_points, n_iter=num_iter)

# Finally, we take a look at the final results.
print(lgbBO.res['max'])
print(lgbBO.res['all'])

---

In [25]:
%%time
y_oof_lgb, pred_test_list_lgb, fold_errors, _ = run_calculations(X, test, cv_folds, 'lgb')

Fitting big fold 1 out of 20 and sub fold 1 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.31158	valid_1's rmse: 1.38838
[400]	training's rmse: 1.15203	valid_1's rmse: 1.3005
[600]	training's rmse: 1.07669	valid_1's rmse: 1.28113
[800]	training's rmse: 1.02286	valid_1's rmse: 1.27589
[1000]	training's rmse: 0.981258	valid_1's rmse: 1.27391
[1200]	training's rmse: 0.940506	valid_1's rmse: 1.27325
[1400]	training's rmse: 0.904548	valid_1's rmse: 1.27285
[1600]	training's rmse: 0.869702	valid_1's rmse: 1.2737
Early stopping, best iteration is:
[1439]	training's rmse: 0.897725	valid_1's rmse: 1.27278
Model training done in 23.80232048034668 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2727760092726583
Time passed: 26.230388641357422 seconds.
Fitting big fold 1 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.33192	valid_1's rmse: 1.35333
[400]	training's r

[1000]	training's rmse: 0.977842	valid_1's rmse: 1.27376
Early stopping, best iteration is:
[908]	training's rmse: 0.998717	valid_1's rmse: 1.27306
Model training done in 18.94931173324585 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2730641738447022
Time passed: 299.0283226966858 seconds.
Fitting big fold 3 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.33357	valid_1's rmse: 1.35366
[400]	training's rmse: 1.19179	valid_1's rmse: 1.25127
[600]	training's rmse: 1.12708	valid_1's rmse: 1.22444
[800]	training's rmse: 1.08278	valid_1's rmse: 1.2145
[1000]	training's rmse: 1.04618	valid_1's rmse: 1.21231
[1200]	training's rmse: 1.01169	valid_1's rmse: 1.20992
[1400]	training's rmse: 0.977366	valid_1's rmse: 1.21028
Early stopping, best iteration is:
[1225]	training's rmse: 1.00759	valid_1's rmse: 1.20965
Model training done in 26.107792854309082 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2

Model training done in 20.732223987579346 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2698104139500026
Time passed: 582.3625359535217 seconds.
Fitting big fold 5 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.33253	valid_1's rmse: 1.36357
[400]	training's rmse: 1.18889	valid_1's rmse: 1.26786
[600]	training's rmse: 1.12251	valid_1's rmse: 1.24406
[800]	training's rmse: 1.0793	valid_1's rmse: 1.23914
[1000]	training's rmse: 1.04298	valid_1's rmse: 1.2394
Early stopping, best iteration is:
[883]	training's rmse: 1.06448	valid_1's rmse: 1.23833
Model training done in 18.341240406036377 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2383254710806757
Time passed: 602.4843242168427 seconds.
Fitting big fold 5 out of 20 and sub fold 3 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.32962	valid_1's rmse: 1.36744
[400]	training's rmse: 1.1851	valid

Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.3322	valid_1's rmse: 1.32651
[400]	training's rmse: 1.18898	valid_1's rmse: 1.23843
[600]	training's rmse: 1.12452	valid_1's rmse: 1.22404
[800]	training's rmse: 1.08065	valid_1's rmse: 1.22291
[1000]	training's rmse: 1.04268	valid_1's rmse: 1.22387
Early stopping, best iteration is:
[840]	training's rmse: 1.07314	valid_1's rmse: 1.22242
Model training done in 20.937936305999756 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2224223788574695
Time passed: 896.2828195095062 seconds.
Fitting big fold 7 out of 20 and sub fold 4 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.32365	valid_1's rmse: 1.40358
[400]	training's rmse: 1.17908	valid_1's rmse: 1.31901
[600]	training's rmse: 1.11329	valid_1's rmse: 1.2989
[800]	training's rmse: 1.06853	valid_1's rmse: 1.29168
[1000]	training's rmse: 1.02989	valid_1's rmse: 1.28741
[1200]	training's rmse: 

[1000]	training's rmse: 1.0348	valid_1's rmse: 1.26581
[1200]	training's rmse: 0.999365	valid_1's rmse: 1.26396
[1400]	training's rmse: 0.967452	valid_1's rmse: 1.26394
[1600]	training's rmse: 0.937955	valid_1's rmse: 1.26405
[1800]	training's rmse: 0.909265	valid_1's rmse: 1.26258
[2000]	training's rmse: 0.880964	valid_1's rmse: 1.26181
[2200]	training's rmse: 0.854727	valid_1's rmse: 1.26109
Early stopping, best iteration is:
[2176]	training's rmse: 0.857597	valid_1's rmse: 1.26056
Model training done in 40.25103259086609 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2605606915746326
Time passed: 1205.3064641952515 seconds.
Fitting big fold 9 out of 20 and sub fold 4 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.32078	valid_1's rmse: 1.41126
[400]	training's rmse: 1.17676	valid_1's rmse: 1.32137
[600]	training's rmse: 1.11151	valid_1's rmse: 1.30282
[800]	training's rmse: 1.06772	valid_1's rmse: 1.29972
[1000]	training's 

Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.32767	valid_1's rmse: 1.38042
[400]	training's rmse: 1.1824	valid_1's rmse: 1.2928
[600]	training's rmse: 1.11579	valid_1's rmse: 1.27341
[800]	training's rmse: 1.07066	valid_1's rmse: 1.2675
[1000]	training's rmse: 1.03348	valid_1's rmse: 1.26705
[1200]	training's rmse: 0.997278	valid_1's rmse: 1.26393
[1400]	training's rmse: 0.965053	valid_1's rmse: 1.26296
[1600]	training's rmse: 0.934169	valid_1's rmse: 1.26233
[1800]	training's rmse: 0.904828	valid_1's rmse: 1.2616
[2000]	training's rmse: 0.877706	valid_1's rmse: 1.26127
[2200]	training's rmse: 0.852665	valid_1's rmse: 1.26121
Early stopping, best iteration is:
[2114]	training's rmse: 0.863459	valid_1's rmse: 1.2609
Model training done in 40.226168155670166 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2609022760413195
Time passed: 1529.5628237724304 seconds.
Fitting big fold 11 out of 20 and sub fold 5 out of 5
Training until valid

[400]	training's rmse: 1.18158	valid_1's rmse: 1.28465
[600]	training's rmse: 1.11454	valid_1's rmse: 1.25871
[800]	training's rmse: 1.07059	valid_1's rmse: 1.25088
[1000]	training's rmse: 1.03341	valid_1's rmse: 1.24693
[1200]	training's rmse: 0.999206	valid_1's rmse: 1.24445
[1400]	training's rmse: 0.966014	valid_1's rmse: 1.24273
[1600]	training's rmse: 0.935116	valid_1's rmse: 1.24256
[1800]	training's rmse: 0.906053	valid_1's rmse: 1.24213
[2000]	training's rmse: 0.879301	valid_1's rmse: 1.24192
Early stopping, best iteration is:
[1860]	training's rmse: 0.897758	valid_1's rmse: 1.2416
Model training done in 34.495545864105225 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2415978678757524
Time passed: 1827.2854216098785 seconds.
Average big fold 13 error: 1.2728356414564723
Total big fold 13 std 0.01723
Fitting big fold 14 out of 20 and sub fold 1 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.31432	valid_1's rmse: 1.377

Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.30692	valid_1's rmse: 1.38425
[400]	training's rmse: 1.14821	valid_1's rmse: 1.30254
[600]	training's rmse: 1.07489	valid_1's rmse: 1.28594
[800]	training's rmse: 1.02547	valid_1's rmse: 1.28259
[1000]	training's rmse: 0.982502	valid_1's rmse: 1.28224
Early stopping, best iteration is:
[975]	training's rmse: 0.987969	valid_1's rmse: 1.28203
Model training done in 18.615153312683105 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2820324368498253
Time passed: 2110.464861869812 seconds.
Fitting big fold 16 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.32922	valid_1's rmse: 1.39475
[400]	training's rmse: 1.18781	valid_1's rmse: 1.28792
[600]	training's rmse: 1.12327	valid_1's rmse: 1.2614
[800]	training's rmse: 1.07992	valid_1's rmse: 1.25617
[1000]	training's rmse: 1.04282	valid_1's rmse: 1.25285
[1200]	training's rm

[600]	training's rmse: 1.12573	valid_1's rmse: 1.23382
[800]	training's rmse: 1.08334	valid_1's rmse: 1.2249
[1000]	training's rmse: 1.04614	valid_1's rmse: 1.22405
[1200]	training's rmse: 1.01246	valid_1's rmse: 1.22133
[1400]	training's rmse: 0.981298	valid_1's rmse: 1.21891
[1600]	training's rmse: 0.951997	valid_1's rmse: 1.21701
[1800]	training's rmse: 0.923211	valid_1's rmse: 1.21604
[2000]	training's rmse: 0.896246	valid_1's rmse: 1.2146
[2200]	training's rmse: 0.870565	valid_1's rmse: 1.2146
Early stopping, best iteration is:
[2006]	training's rmse: 0.895344	valid_1's rmse: 1.21439
Model training done in 39.97116756439209 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2143903512504235
Time passed: 2429.0913548469543 seconds.
Fitting big fold 18 out of 20 and sub fold 3 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.32024	valid_1's rmse: 1.42467
[400]	training's rmse: 1.17457	valid_1's rmse: 1.35038
[600]	training's rms

Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.33013	valid_1's rmse: 1.36115
[400]	training's rmse: 1.18775	valid_1's rmse: 1.26317
[600]	training's rmse: 1.123	valid_1's rmse: 1.23898
[800]	training's rmse: 1.07938	valid_1's rmse: 1.23051
[1000]	training's rmse: 1.04197	valid_1's rmse: 1.22725
[1200]	training's rmse: 1.00774	valid_1's rmse: 1.22722
Early stopping, best iteration is:
[1175]	training's rmse: 1.01146	valid_1's rmse: 1.22691
Model training done in 24.905840158462524 seconds.
17.50439003707821 17.50439003707821
Fold error 1.2269128940237504
Time passed: 2742.8653507232666 seconds.
Fitting big fold 20 out of 20 and sub fold 4 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.32305	valid_1's rmse: 1.39748
[400]	training's rmse: 1.1775	valid_1's rmse: 1.30552
[600]	training's rmse: 1.10987	valid_1's rmse: 1.28377
[800]	training's rmse: 1.06507	valid_1's rmse: 1.27978
[1000]	training's rmse

In [26]:
%%time
avg_error = np.mean(([np.mean(x) for x in fold_errors]))
avg_std = np.std(([np.mean (x) for x in fold_errors]))
print(f'Average error across 20 folds: {avg_error.round(5)}')
print(f'Average std across 20 folds: {avg_std.round(5)}')
print(f'(averaged std of 20 folds: {np.mean(([np.std (x) for x in fold_errors])).round(5)})')

Average error across 20 folds: 1.26893
Average std across 20 folds: 0.00604
(averaged std of 20 folds: 0.02762)
Wall time: 4 ms


In [27]:
print('Length of test predictions:', len(pred_test_list_lgb))
# avg_pred_test_list_lgb = np.expm1(np.mean(pred_test_list_lgb, axis=0))
avg_pred_test_list_lgb = np.mean(pred_test_list_lgb, axis=0)
print('Length of avg test predictions:', len(avg_pred_test_list_lgb))

Length of test predictions: 20
Length of avg test predictions: 49342


In [28]:
avg_pred_test_list_lgb.min(), avg_pred_test_list_lgb.max()

(107842.8297417948, 19432101.594394095)

In [None]:
# ERRORS
# errors = pd.DataFrame(fold_errors)
# errors.to_csv(os.path.join(PATH_TO_DAT`A, 'output/tenich_20_fold_errors_1dconvnn_cv1620_std0037.csv'), index=False, header=False)

# 20x oof train preds
# with open(os.path.join(PATH_TO_DATA, 'output/tenich_20folds_train_1dconvnn_cv1561_std0021.csv'), 'wb') as f:
#     pickle.dump(y_oof_lgb, f)
    
# #20x test preds
# with open(os.path.join(PATH_TO_DATA, 'output/tenich_20folds_test_1dconvnn_cv1561_std0021.csv'), 'wb') as f:
#     pickle.dump(pred_test_list_lgb, f)

In [29]:
%%time
sub_df = pd.read_csv(f'{DATA_PATH}sample_submission.csv')#, usecols = ['ID']
model_to_submit = pd.DataFrame({ 'ID': sub_df['ID'].values,
                            'target': avg_pred_test_list_lgb})
model_to_submit.to_csv(f'{PRED_TEST_PATH}{MODEL_NAME}_20folds_test_averaged_cv{avg_error*1000:0>4.0f}_std{avg_std*1000:0>4.0f}.csv',
                       index=False)

Wall time: 230 ms


In [30]:
model_to_submit.head(10)

Unnamed: 0,ID,target
0,000137c73,6782998.0
1,00021489f,1637664.0
2,0004d7953,5344150.0
3,00056a333,7729168.0
4,00056d8eb,2359130.0
5,0005fc190,3197165.0
6,000787e86,3808208.0
7,0008510a0,4069301.0
8,000895faf,2682486.0
9,000986fba,3279674.0


In [31]:
model_to_submit.tail(10)

Unnamed: 0,ID,target
49332,ffef8aa08,2492344.0
49333,fff0ee67d,3323852.0
49334,fff2aa673,1615774.0
49335,fff479492,1153307.0
49336,fff64bf93,3187633.0
49337,fff73b677,3230444.0
49338,fff7b5923,6304897.0
49339,fff7c698f,4067467.0
49340,fff8dba89,321241.0
49341,fffbe2f6f,6405764.0


In [None]:
test = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/test.csv'), usecols=['ID'])

lgb = pd.DataFrame({'ID': test['ID'].values,
                    'target': avg_pred_test_list_lgb})

lgb.to_csv(os.path.join(PATH_TO_DATA, 'output/tenich_ridge_blending_12855_0026.csv'), index=False)

In [None]:
best_sub_without_leak = lgb.copy()

with open('./05_submission_with_leak_0814/have_data_test.obj', 'rb') as f:
    have_data_test = pickle.load(f)
    
sub_with_leak = pd.read_csv(f'./05_submission_with_leak_0814/kvr777_5f_lgb_from0701_with_leak.csv') 

best_sub_without_leak.loc[have_data_test, 'target'] = sub_with_leak.loc[have_data_test, 'target'] 

best_sub_without_leak.to_csv(os.path.join(PATH_TO_DATA, 'output/tenich_ridge_blending_12855_0026_with_leak.csv'), index=False)