In [1]:
import os
import gc
import time
import pickle

import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
PATH_TO_DATA = ('D:/Py/DataFrames/Santander_Value_Prediction_Challenge(KAGGLE)/')

## Reading and joining data

In [3]:
y = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])
y = np.log1p(y.target.values)
print('Target length:', len(y))

Target length: 4459


In [4]:
%%time
train_df_statistic_and_bin = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_with_row_statistic_and_bin_thresh098.csv'))
test_df_statistic_and_bin = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_with_row_statistic_and_bin_thresh098.csv'))

train_space_reduction = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_space_reduction_50comp.csv'))
test_space_reduction = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_space_reduction_50comp.csv'))

Wall time: 1min 50s


In [5]:
# Use only FA and SRP features
cols_to_drop = [x for x in train_space_reduction.columns if str(x)[:2] not in ['fa', 'sr']]
print('Drop', len(cols_to_drop), 'columns')
train_space_reduction = train_space_reduction.drop(cols_to_drop, axis=1, inplace=True)
test_space_reduction = test_space_reduction.drop(cols_to_drop, axis=1, inplace=True)

Drop 600 columns


In [6]:
%%time
train_df = pd.concat([train_df_statistic_and_bin, train_space_reduction], axis=1)
test_df = pd.concat([test_df_statistic_and_bin, test_space_reduction], axis=1)

del train_df_statistic_and_bin, test_df_statistic_and_bin
del train_space_reduction, test_space_reduction
gc.collect()

print('Train:', train_df.shape)
print('Test:', test_df.shape)

Train: (4459, 4264)
Test: (49342, 4264)
Wall time: 2.54 s


## CV Folds

In [7]:
def get_20_cv_splits(data, in_path):
    #stratify_classes = y
    train = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])
    stratify_classes =  train.target.apply(lambda x: int(np.log10(x)))
    splits = {}
    for random_state in range(20):
        column = np.zeros(data.shape[0])
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
        for i, (_, test_index) in enumerate(sss.split(data, stratify_classes)):
            column[test_index] = i

        splits["split{}".format(random_state)] = column

    pd.DataFrame(splits, index=data.index).to_csv(os.path.join(PATH_TO_DATA, in_path))

In [8]:
# function to generate 100 folds from create_folds_from_cv_splits func
def create_folds_from_cv_splits(in_path, pkl_path):
    
    cv_splits = pd.read_csv(os.path.join(PATH_TO_DATA, in_path))
    folds_list = []
    for ind, i in enumerate(cv_splits.columns[1:]):
        folds = list(set(cv_splits[i].values))
        folds_list.append([])
        for m in folds:
            val_idx = list(cv_splits[cv_splits[i]==m].index)
            train_idx = list(set(list(cv_splits.index)) - set(val_idx))
            folds_list[ind].append((train_idx, val_idx))
    with open(os.path.join(PATH_TO_DATA, pkl_path), 'wb') as f:
        pickle.dump(folds_list, f)
    return folds_list

In [9]:
LOAD_CV = False

if LOAD_CV:
    with open(os.path.join(PATH_TO_DATA, 'folds/custom_cv_thresh098.pkl'), 'rb') as f:
        cv_folds = pickle.load(f)
else:
    get_20_cv_splits(train_df, in_path='folds/cv_splits_cleandata_stat_bin_red_thresh098.csv')
    cv_folds = create_folds_from_cv_splits(in_path='folds/cv_splits_cleandata_stat_bin_red_thresh098.csv',
                                           pkl_path='folds/custom_cv_thresh098.pkl')

## XGBoost

In [10]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'metric': 'rmse',
        'learning_rate': 0.005,
        'max_depth': 8,
        'subsample': 0.7,
        'colsample_bytree': 0.1,
        'alpha':0,
        'lambda': 0,
        'gamma': 1.5,
        'silent': True,
        'random_state': 44,
        'nthread': -1
    }
    
    start_time = time.time()
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgval = xgb.DMatrix(val_X, label=val_y)
    xgtest = xgb.DMatrix(test_X)
    watchlist = [(xgtrain, 'train'), (xgval, 'valid')]
    model = xgb.train(params, xgtrain, 2000, watchlist, 
                      early_stopping_rounds=50, 
                      verbose_eval=100)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(xgtest, ntree_limit=model.best_ntree_limit))
    pred_oof_log = model.predict(xgval, ntree_limit=model.best_ntree_limit)
    return pred_test_y, pred_oof_log, model

## LightGBM

In [10]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        'max_depth': 8, # -1,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.1, # 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 44,
        "verbosity" : -1,
        'num_threads' : 4,
        "seed": 44
    }
    
    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, pred_oof_log, model

In [11]:
def run_calculations(X, test, big_cv_folds, func_name = None):
    if not func_name:
        return print('The function to run is not defined')
    else:
        y_oof_20_preds = []
        fold_errors_20_preds =[]
        avg_test_pred_20_preds = []
        fold_errors_std = []
        
        for ind, cv_folds in enumerate(big_cv_folds):
            print('Fitting big fold', ind+1, 'out of', len(big_cv_folds))
            y_oof = np.zeros((y.shape[0]))
            fold_errors =[]
            pred_test_list = []
            
            for i, (train_index, val_index) in enumerate(cv_folds):
                print('Fitting sub fold', i+1, 'out of', len(cv_folds))
                X_train, X_val  = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y[train_index], y[val_index]

                # part to include additional functions
                if func_name == 'lgb':
                    pred_test_y, pred_oof_log, clf = run_lgb(X_train, y_train, X_val, y_val, test)
                elif func_name == 'xgb':
                    pred_test_y, pred_oof_log, clf = run_xgb(X_train, y_train, X_val, y_val, test)
                else:
                    return print('The function to run is not correct')

                y_oof[val_index] = pred_oof_log
                curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
                print(f'Fold error {curr_fe}')
                fold_errors.append(curr_fe)
                pred_test_list.append(list(pred_test_y))

            print('Total error', np.sqrt(mean_squared_error(y, y_oof)))
            total_fe_std = round(np.std(fold_errors), 5)
            print(f'Total std {total_fe_std}')
            avg_test_pred = np.mean(pred_test_list, axis=0)
            
            avg_test_pred_20_preds.append(avg_test_pred)
            fold_errors_20_preds.append(fold_errors)
            y_oof_20_preds.append(y_oof)
            fold_errors_std.append(total_fe_std)
            
        return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds, fold_errors_std

---

In [38]:
%%time
y_oof_lgb, pred_test_list_lgb, fold_errors, fold_std = run_calculations(train_df, test_df, cv_folds, 'lgb')

Fitting big fold 1 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.41342	valid_1's rmse: 1.48133
[300]	training's rmse: 1.25129	valid_1's rmse: 1.38516
[450]	training's rmse: 1.1597	valid_1's rmse: 1.34971
[600]	training's rmse: 1.09648	valid_1's rmse: 1.33414
[750]	training's rmse: 1.05199	valid_1's rmse: 1.32888
[900]	training's rmse: 1.01819	valid_1's rmse: 1.32769
[1050]	training's rmse: 0.991393	valid_1's rmse: 1.3276
Early stopping, best iteration is:
[999]	training's rmse: 1.00024	valid_1's rmse: 1.32752
Model training done in 21.002872228622437 seconds.
Fold error 1.3275214733499638
Fitting sub fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.42497	valid_1's rmse: 1.45788
[300]	training's rmse: 1.27312	valid_1's rmse: 1.34938
[450]	training's rmse: 1.19072	valid_1's rmse: 1.31285
[600]	training's rmse: 1.13402	valid_1's rmse: 1.29862
[750]	trainin

[150]	training's rmse: 1.42232	valid_1's rmse: 1.46743
[300]	training's rmse: 1.26767	valid_1's rmse: 1.37744
[450]	training's rmse: 1.18277	valid_1's rmse: 1.34822
[600]	training's rmse: 1.12443	valid_1's rmse: 1.34034
[750]	training's rmse: 1.08484	valid_1's rmse: 1.3373
[900]	training's rmse: 1.05394	valid_1's rmse: 1.33755
Early stopping, best iteration is:
[818]	training's rmse: 1.07002	valid_1's rmse: 1.33702
Model training done in 25.68719792366028 seconds.
Fold error 1.3370187848901829
Fitting sub fold 4 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.41757	valid_1's rmse: 1.50252
[300]	training's rmse: 1.26472	valid_1's rmse: 1.39823
[450]	training's rmse: 1.18226	valid_1's rmse: 1.35905
[600]	training's rmse: 1.12427	valid_1's rmse: 1.34221
[750]	training's rmse: 1.08577	valid_1's rmse: 1.33468
[900]	training's rmse: 1.05545	valid_1's rmse: 1.33165
Early stopping, best iteration is:
[936]	training's rmse: 1.04997	valid_1's rmse

Model training done in 38.74387240409851 seconds.
Fold error 1.3387662368304756
Total error 1.34020174441
Total std 0.01049
Fitting big fold 6 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.40544	valid_1's rmse: 1.48246
[300]	training's rmse: 1.23949	valid_1's rmse: 1.39218
[450]	training's rmse: 1.14529	valid_1's rmse: 1.36049
[600]	training's rmse: 1.07963	valid_1's rmse: 1.34838
[750]	training's rmse: 1.03455	valid_1's rmse: 1.34463
[900]	training's rmse: 1.00146	valid_1's rmse: 1.34374
Early stopping, best iteration is:
[819]	training's rmse: 1.01803	valid_1's rmse: 1.34359
Model training done in 17.293269395828247 seconds.
Fold error 1.3435944146575804
Fitting sub fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.42353	valid_1's rmse: 1.50208
[300]	training's rmse: 1.27462	valid_1's rmse: 1.3857
[450]	training's rmse: 1.19405	valid_1's rmse: 1.33958


[750]	training's rmse: 1.08062	valid_1's rmse: 1.39637
[900]	training's rmse: 1.05029	valid_1's rmse: 1.39615
Early stopping, best iteration is:
[873]	training's rmse: 1.05534	valid_1's rmse: 1.39585
Model training done in 27.582966804504395 seconds.
Fold error 1.3958481021803664
Fitting sub fold 4 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.41792	valid_1's rmse: 1.49218
[300]	training's rmse: 1.26452	valid_1's rmse: 1.39547
[450]	training's rmse: 1.18043	valid_1's rmse: 1.35965
[600]	training's rmse: 1.12275	valid_1's rmse: 1.34446
[750]	training's rmse: 1.08324	valid_1's rmse: 1.3389
[900]	training's rmse: 1.05367	valid_1's rmse: 1.3359
[1050]	training's rmse: 1.03181	valid_1's rmse: 1.33528
[1200]	training's rmse: 1.01268	valid_1's rmse: 1.33424
Early stopping, best iteration is:
[1158]	training's rmse: 1.01816	valid_1's rmse: 1.33404
Model training done in 29.075162649154663 seconds.
Fold error 1.3340396634575116
Fitting sub fold

Fold error 1.3227133049669528
Total error 1.33664968207
Total std 0.01652
Fitting big fold 11 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.41733	valid_1's rmse: 1.48132
[300]	training's rmse: 1.25402	valid_1's rmse: 1.39042
[450]	training's rmse: 1.16155	valid_1's rmse: 1.35839
[600]	training's rmse: 1.09806	valid_1's rmse: 1.34543
[750]	training's rmse: 1.05314	valid_1's rmse: 1.34154
[900]	training's rmse: 1.0182	valid_1's rmse: 1.34021
[1050]	training's rmse: 0.989179	valid_1's rmse: 1.3405
Early stopping, best iteration is:
[1003]	training's rmse: 0.9976	valid_1's rmse: 1.34004
Model training done in 19.55488085746765 seconds.
Fold error 1.340037040941307
Fitting sub fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.42583	valid_1's rmse: 1.45808
[300]	training's rmse: 1.27465	valid_1's rmse: 1.35439
[450]	training's rmse: 1.19258	valid_1's rmse: 1.3

[150]	training's rmse: 1.39883	valid_1's rmse: 1.50516
[300]	training's rmse: 1.23707	valid_1's rmse: 1.41217
[450]	training's rmse: 1.14509	valid_1's rmse: 1.37925
[600]	training's rmse: 1.08225	valid_1's rmse: 1.36562
[750]	training's rmse: 1.03879	valid_1's rmse: 1.36067
[900]	training's rmse: 1.00599	valid_1's rmse: 1.35909
[1050]	training's rmse: 0.980928	valid_1's rmse: 1.35896
Early stopping, best iteration is:
[994]	training's rmse: 0.989272	valid_1's rmse: 1.35873
Model training done in 19.417481422424316 seconds.
Fold error 1.3587316505503721
Fitting sub fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.42269	valid_1's rmse: 1.47513
[300]	training's rmse: 1.26962	valid_1's rmse: 1.38314
[450]	training's rmse: 1.18638	valid_1's rmse: 1.34787
[600]	training's rmse: 1.12913	valid_1's rmse: 1.3317
[750]	training's rmse: 1.0902	valid_1's rmse: 1.32525
[900]	training's rmse: 1.05948	valid_1's rmse: 1.32223
[1050]	training's rmse

Model training done in 29.038577795028687 seconds.
Fold error 1.3121036975715032
Fitting sub fold 4 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.41418	valid_1's rmse: 1.50552
[300]	training's rmse: 1.2568	valid_1's rmse: 1.42469
[450]	training's rmse: 1.17062	valid_1's rmse: 1.39488
[600]	training's rmse: 1.11313	valid_1's rmse: 1.38298
[750]	training's rmse: 1.07425	valid_1's rmse: 1.37769
[900]	training's rmse: 1.04347	valid_1's rmse: 1.3753
[1050]	training's rmse: 1.02009	valid_1's rmse: 1.37511
Early stopping, best iteration is:
[977]	training's rmse: 1.03168	valid_1's rmse: 1.37444
Model training done in 27.9110369682312 seconds.
Fold error 1.374440207760824
Fitting sub fold 5 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.40949	valid_1's rmse: 1.52678
[300]	training's rmse: 1.25431	valid_1's rmse: 1.43293
[450]	training's rmse: 1.16725	valid_1's rmse: 1.40061
[600]	training's rms

In [4]:
print('Avg error:', np.mean(fold_errors))
print('Avg std:', np.mean(fold_std))

Avg error: 1.34039566247
Avg std: 0.027104


In [40]:
# ERRORS
# errors = pd.DataFrame(fold_errors)
# errors.to_csv(os.path.join(PATH_TO_DATA, 'output/20_fold_errors_lgb_cv1340_std0027.csv'), index=False, header=False)

# 20x TRAIN TARGET OOF PREDS
with open(os.path.join(PATH_TO_DATA, 'output/nikita_train_20_folds_lgb_cv1340_std0027.pkl'), 'wb') as f:
    pickle.dump(y_oof_lgb, f)

# 20x TEST PREDS
with open(os.path.join(PATH_TO_DATA, 'output/nikita_test_20_folds_lgb_cv1340_std0027.pkl'), 'wb') as f:
    pickle.dump(pred_test_list_lgb, f)