In [1]:
import os
import gc
import time
import pickle

import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit
from boostaroota import BoostARoota

In [2]:
PATH_TO_DATA = ('D:/Py/DataFrames/Santander_Value_Prediction_Challenge(KAGGLE)/')

## Reading and joining data

In [3]:
y = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])
y = np.log1p(y.target.values)
print('Target length:', len(y))

Target length: 4459


In [14]:
%%time
train_df_statistic = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_with_row_statistic_and_bin_thresh098.csv'))
test_df_statistic = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_with_row_statistic_and_bin_thresh098.csv'))

train_space_reduction = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_space_reduction_thresh098.csv'))
test_space_reduction = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_space_reduction_thresh098.csv'))

Wall time: 1min 34s


In [15]:
%%time
train_df = pd.concat([train_df_statistic, train_space_reduction], axis=1)
test_df = pd.concat([test_df_statistic, test_space_reduction], axis=1)

del train_df_statistic, test_df_statistic
del train_space_reduction, test_space_reduction
gc.collect()

print('Train:', train_df.shape)
print('Test:', test_df.shape)

Train: (4459, 4964)
Test: (49342, 4964)
Wall time: 4.41 s


# Boruta Feature Selection

In [17]:
br = BoostARoota(metric='rmse')
br.fit(train_df, y)

Round:  1  iteration:  1
Round:  1  iteration:  2
Round:  1  iteration:  3
Round:  1  iteration:  4
Round:  1  iteration:  5
Round:  1  iteration:  6
Round:  1  iteration:  7
Round:  1  iteration:  8
Round:  1  iteration:  9
Round:  1  iteration:  10
Round:  2  iteration:  1
Round:  2  iteration:  2
Round:  2  iteration:  3
Round:  2  iteration:  4
Round:  2  iteration:  5
Round:  2  iteration:  6
Round:  2  iteration:  7
Round:  2  iteration:  8
Round:  2  iteration:  9
Round:  2  iteration:  10
Round:  3  iteration:  1
Round:  3  iteration:  2
Round:  3  iteration:  3
Round:  3  iteration:  4
Round:  3  iteration:  5
Round:  3  iteration:  6
Round:  3  iteration:  7
Round:  3  iteration:  8
Round:  3  iteration:  9
Round:  3  iteration:  10
Round:  4  iteration:  1
Round:  4  iteration:  2
Round:  4  iteration:  3
Round:  4  iteration:  4
Round:  4  iteration:  5
Round:  4  iteration:  6
Round:  4  iteration:  7
Round:  4  iteration:  8
Round:  4  iteration:  9
Round:  4  iteration: 

<boostaroota.boostaroota.BoostARoota at 0x18b15248b00>

In [20]:
print('Total features:', train_df.shape[1])
print('Number of selected features:', len(br.keep_vars_.values))

Total features: 4964
Number of selected features: 432


In [24]:
# оставляем только те фичи, которые Boruta посчитал нужными
boruta_features = br.keep_vars_.values
train_df[boruta_features].to_csv(os.path.join(PATH_TO_DATA, 'train_boruta_stat_bin_red_thresh098.csv'), index=False)
test_df[boruta_features].to_csv(os.path.join(PATH_TO_DATA, 'test_boruta_stat_bin_red_thresh098.csv'), index=False)

## CV Folds

In [4]:
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_boruta_stat_bin_red_thresh098.csv'))
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_boruta_stat_bin_red_thresh098.csv'))

In [5]:
def get_20_cv_splits(data, in_path):
    #stratify_classes = y
    train = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])
    stratify_classes =  train.target.apply(lambda x: int(np.log10(x)))
    splits = {}
    for random_state in range(20):
        column = np.zeros(data.shape[0])
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
        for i, (_, test_index) in enumerate(sss.split(data, stratify_classes)):
            column[test_index] = i

        splits["split{}".format(random_state)] = column

    pd.DataFrame(splits, index=data.index).to_csv(os.path.join(PATH_TO_DATA, in_path))

In [6]:
# function to generate 100 folds from create_folds_from_cv_splits func
def create_folds_from_cv_splits(in_path, pkl_path):
    
    cv_splits = pd.read_csv(os.path.join(PATH_TO_DATA, in_path))
    folds_list = []
    for ind, i in enumerate(cv_splits.columns[1:]):
        folds = list(set(cv_splits[i].values))
        folds_list.append([])
        for m in folds:
            val_idx = list(cv_splits[cv_splits[i]==m].index)
            train_idx = list(set(list(cv_splits.index)) - set(val_idx))
            folds_list[ind].append((train_idx, val_idx))
    with open(os.path.join(PATH_TO_DATA, pkl_path), 'wb') as f:
        pickle.dump(folds_list, f)
    return folds_list

In [7]:
LOAD_CV = True

if LOAD_CV:
    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv_boruta_thresh098.pkl'), 'rb') as f:
        cv_folds = pickle.load(f)
else:
    get_20_cv_splits(train_df, in_path='folds/cv_splits_boruta_stat_bin_red_thresh098.csv')
    cv_folds = create_folds_from_cv_splits(in_path='folds/cv_splits_boruta_stat_bin_red_thresh098.csv',
                                           pkl_path='folds/custom_cv_boruta_thresh098.pkl')

## LightGBM

In [8]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        'max_depth': 7,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.1,
        "bagging_frequency" : 6,
        "bagging_seed" : 44,
        "verbosity" : -1,
        'num_threads' : 4,
        "seed": 44
    }
    
    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, pred_oof_log, model

In [9]:
def run_calculations(X, test, big_cv_folds, func_name = None):
    if not func_name:
        return print('The function to run is not defined')
    else:
        y_oof_20_preds = []
        fold_errors_20_preds =[]
        avg_test_pred_20_preds = []
        fold_errors_std = []
        
        for ind, cv_folds in enumerate(big_cv_folds):
            print('Fitting big fold', ind+1, 'out of', len(big_cv_folds))
            y_oof = np.zeros((y.shape[0]))
            fold_errors =[]
            pred_test_list = []
            
            for i, (train_index, val_index) in enumerate(cv_folds):
                print('Fitting sub fold', i+1, 'out of', len(cv_folds))
                X_train, X_val  = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y[train_index], y[val_index]

                # part to include additional functions
                if func_name == 'lgb':
                    pred_test_y, pred_oof_log, clf = run_lgb(X_train, y_train, X_val, y_val, test)
                elif func_name == 'xgb':
                    pred_test_y, pred_oof_log, clf = run_xgb(X_train, y_train, X_val, y_val, test)
                else:
                    return print('The function to run is not correct')

                y_oof[val_index] = pred_oof_log
                curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
                print(f'Fold error {curr_fe}')
                fold_errors.append(curr_fe)
                pred_test_list.append(list(pred_test_y))

            print('Total error', np.sqrt(mean_squared_error(y, y_oof)))
            total_fe_std = round(np.std(fold_errors), 5)
            print(f'Total std {total_fe_std}')
            avg_test_pred = np.mean(pred_test_list, axis=0)
            
            avg_test_pred_20_preds.append(avg_test_pred)
            fold_errors_20_preds.append(fold_errors)
            y_oof_20_preds.append(y_oof)
            fold_errors_std.append(total_fe_std)
            
        return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds, fold_errors_std

In [10]:
%%time
y_oof_lgb, pred_test_list_lgb, fold_errors, fold_std = run_calculations(train_df, test_df, cv_folds, 'lgb')

Fitting big fold 1 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.37472	valid_1's rmse: 1.47699
[300]	training's rmse: 1.17894	valid_1's rmse: 1.38463
[450]	training's rmse: 1.04904	valid_1's rmse: 1.34751
[600]	training's rmse: 0.953061	valid_1's rmse: 1.33187
[750]	training's rmse: 0.875047	valid_1's rmse: 1.32576
[900]	training's rmse: 0.808578	valid_1's rmse: 1.3228
[1050]	training's rmse: 0.74878	valid_1's rmse: 1.32161
[1200]	training's rmse: 0.695046	valid_1's rmse: 1.32089
[1350]	training's rmse: 0.64718	valid_1's rmse: 1.32076
Early stopping, best iteration is:
[1255]	training's rmse: 0.676561	valid_1's rmse: 1.32062
Model training done in 10.193679332733154 seconds.
Fold error 1.3206234627326736
Fitting sub fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.40138	valid_1's rmse: 1.44638
[300]	training's rmse: 1.23042	valid_1's rmse: 1.33666
[450]

[2100]	training's rmse: 0.578548	valid_1's rmse: 1.29703
Early stopping, best iteration is:
[2028]	training's rmse: 0.593222	valid_1's rmse: 1.29674
Model training done in 16.222151041030884 seconds.
Fold error 1.2967371405923904
Total error 1.32502507405
Total std 0.04551
Fitting big fold 3 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.36394	valid_1's rmse: 1.49644
[300]	training's rmse: 1.16086	valid_1's rmse: 1.41387
[450]	training's rmse: 1.0272	valid_1's rmse: 1.38208
[600]	training's rmse: 0.932213	valid_1's rmse: 1.3701
[750]	training's rmse: 0.853153	valid_1's rmse: 1.36517
[900]	training's rmse: 0.788723	valid_1's rmse: 1.36267
[1050]	training's rmse: 0.731264	valid_1's rmse: 1.36218
Early stopping, best iteration is:
[967]	training's rmse: 0.762589	valid_1's rmse: 1.36199
Model training done in 7.893824815750122 seconds.
Fold error 1.361986976918688
Fitting sub fold 2 out of 5
Training until valid

[1350]	training's rmse: 0.755855	valid_1's rmse: 1.31763
[1500]	training's rmse: 0.715586	valid_1's rmse: 1.31638
[1650]	training's rmse: 0.678012	valid_1's rmse: 1.31518
[1800]	training's rmse: 0.641956	valid_1's rmse: 1.31486
Early stopping, best iteration is:
[1837]	training's rmse: 0.633652	valid_1's rmse: 1.31427
Model training done in 15.42061448097229 seconds.
Fold error 1.314266652098182
Total error 1.32831933978
Total std 0.01392
Fitting big fold 5 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.36632	valid_1's rmse: 1.49801
[300]	training's rmse: 1.16968	valid_1's rmse: 1.40437
[450]	training's rmse: 1.04072	valid_1's rmse: 1.36716
[600]	training's rmse: 0.947167	valid_1's rmse: 1.35323
[750]	training's rmse: 0.866865	valid_1's rmse: 1.34637
[900]	training's rmse: 0.800328	valid_1's rmse: 1.34276
[1050]	training's rmse: 0.738956	valid_1's rmse: 1.33993
[1200]	training's rmse: 0.683549	valid_1's rmse

Early stopping, best iteration is:
[1503]	training's rmse: 0.702919	valid_1's rmse: 1.33396
Model training done in 13.00113034248352 seconds.
Fold error 1.3339623741708542
Total error 1.32320411281
Total std 0.02289
Fitting big fold 7 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.38073	valid_1's rmse: 1.45949
[300]	training's rmse: 1.18229	valid_1's rmse: 1.37075
[450]	training's rmse: 1.05124	valid_1's rmse: 1.33601
[600]	training's rmse: 0.955303	valid_1's rmse: 1.32278
[750]	training's rmse: 0.880417	valid_1's rmse: 1.31686
[900]	training's rmse: 0.817568	valid_1's rmse: 1.31397
[1050]	training's rmse: 0.760645	valid_1's rmse: 1.31279
[1200]	training's rmse: 0.707098	valid_1's rmse: 1.31231
Early stopping, best iteration is:
[1164]	training's rmse: 0.719649	valid_1's rmse: 1.31198
Model training done in 9.08918285369873 seconds.
Fold error 1.3119814283099858
Fitting sub fold 2 out of 5
Training until val

Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.3698	valid_1's rmse: 1.49778
[300]	training's rmse: 1.17892	valid_1's rmse: 1.40172
[450]	training's rmse: 1.0521	valid_1's rmse: 1.36125
[600]	training's rmse: 0.957129	valid_1's rmse: 1.3439
[750]	training's rmse: 0.878516	valid_1's rmse: 1.33489
[900]	training's rmse: 0.810655	valid_1's rmse: 1.32988
[1050]	training's rmse: 0.752135	valid_1's rmse: 1.32692
[1200]	training's rmse: 0.700266	valid_1's rmse: 1.32486
[1350]	training's rmse: 0.651522	valid_1's rmse: 1.32351
[1500]	training's rmse: 0.60758	valid_1's rmse: 1.3227
[1650]	training's rmse: 0.565435	valid_1's rmse: 1.32204
[1800]	training's rmse: 0.52675	valid_1's rmse: 1.32168
[1950]	training's rmse: 0.490338	valid_1's rmse: 1.32172
Early stopping, best iteration is:
[1855]	training's rmse: 0.512198	valid_1's rmse: 1.32158
Model training done in 13.855243444442749 seconds.
Fold error 1.3215830082287714
Fitting sub fold 2 out of 5
Training u

Early stopping, best iteration is:
[1081]	training's rmse: 0.827864	valid_1's rmse: 1.31189
Model training done in 9.929903984069824 seconds.
Fold error 1.3118922276801122
Total error 1.32450029604
Total std 0.02235
Fitting big fold 11 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.37661	valid_1's rmse: 1.47391
[300]	training's rmse: 1.1792	valid_1's rmse: 1.38326
[450]	training's rmse: 1.05065	valid_1's rmse: 1.34717
[600]	training's rmse: 0.95707	valid_1's rmse: 1.33397
[750]	training's rmse: 0.882023	valid_1's rmse: 1.32843
[900]	training's rmse: 0.818291	valid_1's rmse: 1.32478
[1050]	training's rmse: 0.758982	valid_1's rmse: 1.32333
[1200]	training's rmse: 0.706026	valid_1's rmse: 1.32172
[1350]	training's rmse: 0.656801	valid_1's rmse: 1.32116
[1500]	training's rmse: 0.612124	valid_1's rmse: 1.32055
Early stopping, best iteration is:
[1448]	training's rmse: 0.627174	valid_1's rmse: 1.32032
Model traini

[1500]	training's rmse: 0.703425	valid_1's rmse: 1.35021
Early stopping, best iteration is:
[1401]	training's rmse: 0.730555	valid_1's rmse: 1.34973
Model training done in 12.437179565429688 seconds.
Fold error 1.3497306738397241
Total error 1.33146680625
Total std 0.01684
Fitting big fold 13 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.36902	valid_1's rmse: 1.48482
[300]	training's rmse: 1.17049	valid_1's rmse: 1.39533
[450]	training's rmse: 1.04129	valid_1's rmse: 1.36069
[600]	training's rmse: 0.944247	valid_1's rmse: 1.34667
[750]	training's rmse: 0.86724	valid_1's rmse: 1.34026
[900]	training's rmse: 0.802341	valid_1's rmse: 1.3362
[1050]	training's rmse: 0.742043	valid_1's rmse: 1.33421
[1200]	training's rmse: 0.690544	valid_1's rmse: 1.3334
[1350]	training's rmse: 0.64355	valid_1's rmse: 1.33255
[1500]	training's rmse: 0.598104	valid_1's rmse: 1.33231
Early stopping, best iteration is:
[1436]	traini

Model training done in 14.706377744674683 seconds.
Fold error 1.2895277868824733
Fitting sub fold 5 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.38375	valid_1's rmse: 1.50551
[300]	training's rmse: 1.20567	valid_1's rmse: 1.41712
[450]	training's rmse: 1.08993	valid_1's rmse: 1.38405
[600]	training's rmse: 1.00735	valid_1's rmse: 1.37184
[750]	training's rmse: 0.942608	valid_1's rmse: 1.36647
[900]	training's rmse: 0.887062	valid_1's rmse: 1.36357
[1050]	training's rmse: 0.836608	valid_1's rmse: 1.36103
[1200]	training's rmse: 0.79099	valid_1's rmse: 1.36002
[1350]	training's rmse: 0.746094	valid_1's rmse: 1.35941
[1500]	training's rmse: 0.705915	valid_1's rmse: 1.35933
Early stopping, best iteration is:
[1461]	training's rmse: 0.715723	valid_1's rmse: 1.3588
Model training done in 12.783884525299072 seconds.
Fold error 1.3587976416995036
Total error 1.32345388169
Total std 0.03183
Fitting big fold 15 out of 20
Fitting sub fold 1 out 

[1050]	training's rmse: 0.854242	valid_1's rmse: 1.33769
[1200]	training's rmse: 0.808552	valid_1's rmse: 1.33641
[1350]	training's rmse: 0.766375	valid_1's rmse: 1.33569
Early stopping, best iteration is:
[1252]	training's rmse: 0.793476	valid_1's rmse: 1.33529
Model training done in 11.400541067123413 seconds.
Fold error 1.3352932060915574
Fitting sub fold 5 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.39024	valid_1's rmse: 1.47729
[300]	training's rmse: 1.21116	valid_1's rmse: 1.38521
[450]	training's rmse: 1.09614	valid_1's rmse: 1.35104
[600]	training's rmse: 1.01376	valid_1's rmse: 1.33793
[750]	training's rmse: 0.948517	valid_1's rmse: 1.33118
[900]	training's rmse: 0.891399	valid_1's rmse: 1.32748
[1050]	training's rmse: 0.841589	valid_1's rmse: 1.32506
[1200]	training's rmse: 0.797674	valid_1's rmse: 1.32415
[1350]	training's rmse: 0.754605	valid_1's rmse: 1.32322
[1500]	training's rmse: 0.71412	valid_1's rmse: 1.32228
[1650]

Fold error 1.2862526805664043
Fitting sub fold 4 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.38837	valid_1's rmse: 1.50143
[300]	training's rmse: 1.20956	valid_1's rmse: 1.42148
[450]	training's rmse: 1.09572	valid_1's rmse: 1.39178
[600]	training's rmse: 1.01607	valid_1's rmse: 1.37896
[750]	training's rmse: 0.950624	valid_1's rmse: 1.37261
[900]	training's rmse: 0.894607	valid_1's rmse: 1.36888
[1050]	training's rmse: 0.845947	valid_1's rmse: 1.36817
[1200]	training's rmse: 0.799583	valid_1's rmse: 1.3672
[1350]	training's rmse: 0.756515	valid_1's rmse: 1.36645
Early stopping, best iteration is:
[1393]	training's rmse: 0.744205	valid_1's rmse: 1.36595
Model training done in 12.478843688964844 seconds.
Fold error 1.3659452749511016
Fitting sub fold 5 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.38116	valid_1's rmse: 1.52524
[300]	training's rmse: 1.20169	valid_1's rmse: 1.43679
[45

[1500]	training's rmse: 0.739798	valid_1's rmse: 1.32617
[1650]	training's rmse: 0.702117	valid_1's rmse: 1.32573
Early stopping, best iteration is:
[1575]	training's rmse: 0.720768	valid_1's rmse: 1.32551
Model training done in 14.461848258972168 seconds.
Fold error 1.3255085640256676
Fitting sub fold 4 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.39652	valid_1's rmse: 1.46608
[300]	training's rmse: 1.21982	valid_1's rmse: 1.37176
[450]	training's rmse: 1.10563	valid_1's rmse: 1.33514
[600]	training's rmse: 1.02375	valid_1's rmse: 1.32141
[750]	training's rmse: 0.957642	valid_1's rmse: 1.31594
[900]	training's rmse: 0.899658	valid_1's rmse: 1.31173
[1050]	training's rmse: 0.848047	valid_1's rmse: 1.30906
[1200]	training's rmse: 0.799302	valid_1's rmse: 1.30735
[1350]	training's rmse: 0.756151	valid_1's rmse: 1.30636
[1500]	training's rmse: 0.715334	valid_1's rmse: 1.30571
[1650]	training's rmse: 0.677133	valid_1's rmse: 1.30494
[1800

In [11]:
print('Avg error:', np.mean(fold_errors))
print('Avg std:', np.mean(fold_std))

Avg error: 1.3209006768
Avg std: 0.0280805


In [12]:
# ERRORS
# errors = pd.DataFrame(fold_errors)
# errors.to_csv(os.path.join(PATH_TO_DATA, 'output/20_fold_errors_lgb_cv1340_std0022.csv'), index=False, header=False)

# 20x TRAIN TARGET OOF PREDS
with open(os.path.join(PATH_TO_DATA, 'output/nikita_train_20_folds_lgb_cv1320_std0028.pkl'), 'wb') as f:
    pickle.dump(y_oof_lgb, f)

# 20x TEST PREDS
with open(os.path.join(PATH_TO_DATA, 'output/nikita_test_20_folds_lgb_cv1320_std0028.pkl'), 'wb') as f:
    pickle.dump(pred_test_list_lgb, f)