# Modules and settings

In [1]:
import os
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
try:
    import cPickle as pickle
except:
    import pickle
import time

In [2]:
DATA_PATH = '../input/'
PRED_TEST_PATH = '../submit/'
PRED_TRAIN_PATH = '../submit/'
FOLDS_PATH = '../kvr777/folds/'
FEATURES_PATH = '../features/'
MODEL_NAME = 'leonid07'

In [3]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [4]:
def convert_col_to_proper_int(df_col):
    col_type = df_col.dtype
    c_min = df_col.min()
    c_max = df_col.max()
#     print('convert_col_to_proper_int column: ', df_col.name, 'type: ', col_type, 'c_min: ', c_min)
    if ((str(col_type)[:3] == 'int') | (str(col_type)[:4] == 'uint')): # | (str(col_type)[:5] == 'float')
        if c_min < 0:
#             print('c_min: ', c_min, 'less 0')
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df_col = df_col.astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df_col = df_col.astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df_col = df_col.astype(np.int32)
            elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                df_col = df_col.astype(np.int64)
        else:
#             print('c_min: ', c_min, 'not less 0')
            if c_max <= np.iinfo(np.uint8).max:
                df_col = df_col.astype(np.uint8)
            elif c_max <= np.iinfo(np.uint16).max:
                df_col = df_col.astype(np.uint16)
            elif c_max <= np.iinfo(np.uint32).max:
                df_col = df_col.astype(np.uint32)
            elif c_max <= np.iinfo(np.uint64).max:
                df_col = df_col.astype(np.uint64)
            
    return df_col

def convert_col_to_proper_float(df_col):
    col_type = df_col.dtype
    if str(col_type)[:5] == 'float':
        unique_count = len(np.unique(df_col))
        df_col_temp = df_col.astype(np.float32)
        if len(np.unique(df_col_temp)) == unique_count:
            df_col = df_col_temp
            c_min = df_col.min()
            c_max = df_col.max()
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df_col_temp = df_col.astype(np.float16)
                if len(np.unique(df_col_temp)) == unique_count:
                    df_col = df_col_temp
            
    return df_col



def float_to_int(df):
    """ iterate through all float columns of a dataframe and modify the data type
        to reduce memory usage.
    """
#     print('Begin float_to_int')
    for col in df.columns:
        col_type = df[col].dtype
#         print('column: ', col, 'type: ', col_type)
        if str(col_type)[:5] == 'float':
            if (df[col] % 1 == 0).all():
                df[col] = convert_col_to_proper_int(df[col].astype(np.int64))
    
    return df

def float_reduced(df):
    """ iterate through all float columns of a dataframe and modify the data type
        to reduce memory usage.
    """
#     print('Begin float_reduced')
    for col in df.columns:
        col_type = df[col].dtype
#         print('column: ', col, 'type: ', col_type)
        if str(col_type)[:5] == 'float':
            df[col] = convert_col_to_proper_float(df[col])
    
    return df

def int_reduced(df):
    """ iterate through all int columns of a dataframe and modify the data type
        to reduce memory usage.
    """
#     print('Begin float_reduced')
    for col in df.columns:
        df[col] = convert_col_to_proper_int(df[col])
    
    return df


# Load data

In [5]:
%%time
# Loading from pickle example
crowded_features_data_store = f'{FEATURES_PATH}crowded_features_data_store.pkl'
if os.path.isfile(crowded_features_data_store):
    print("loading data from pickle file", crowded_features_data_store)
    with open(os.path.abspath(crowded_features_data_store), 'rb') as f:
        crowded_features_df, _, _, _ = pickle.load(f, encoding='bytes')
        print('crowded_features_df:', type(crowded_features_df), crowded_features_df.shape)
#         print('target:', type(target), target.shape)
#         print('train_idx_rng:', type(train_idx_rng), 'start:', train_idx_rng.start,
#               'stop:', train_idx_rng.stop, 'step:', train_idx_rng.step)
#         print('test_idx_rng:', type(test_idx_rng), 'start:', test_idx_rng.start,
#               'stop:', test_idx_rng.stop, 'step:', test_idx_rng.step)

izmajlovkonstantin3_original_data_store = f'{FEATURES_PATH}izmajlovkonstantin3_original_data_store.pkl'
if os.path.isfile(izmajlovkonstantin3_original_data_store):
    print("loading data from pickle file", izmajlovkonstantin3_original_data_store)
    with open(os.path.abspath(izmajlovkonstantin3_original_data_store), 'rb') as f:
        total_df, target, train_idx_rng, test_idx_rng = pickle.load(f, encoding='bytes')
        print('total_df:', type(total_df), total_df.shape)
        print('target:', type(target), target.shape)
        print('train_idx_rng:', type(train_idx_rng), 'start:', train_idx_rng.start,
              'stop:', train_idx_rng.stop, 'step:', train_idx_rng.step)
        print('test_idx_rng:', type(test_idx_rng), 'start:', test_idx_rng.start,
              'stop:', test_idx_rng.stop, 'step:', test_idx_rng.step)

loading data from pickle file ../features/crowded_features_data_store.pkl
crowded_features_df: <class 'pandas.core.frame.DataFrame'> (53801, 5369)
loading data from pickle file ../features/izmajlovkonstantin3_original_data_store.pkl
total_df: <class 'pandas.core.frame.DataFrame'> (53801, 5495)
target: <class 'numpy.ndarray'> (4459,)
train_idx_rng: <class 'range'> start: 0 stop: 4459 step: 1
test_idx_rng: <class 'range'> start: 4459 stop: 53801 step: 1
Wall time: 3.58 s


In [6]:
%%time
total_df=int_reduced(float_reduced(float_to_int(total_df)))
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53801 entries, 0 to 53800
Columns: 5495 entries, 48df886f9 to 66ace2992/b791ce9aa
dtypes: float32(4814), float64(680), int32(1)
memory usage: 1.2 GB
Wall time: 1h 1min 15s


In [7]:
len(set(crowded_features_df.columns)-set(total_df.columns))

2670

In [8]:
len(set(total_df.columns)-set(crowded_features_df.columns))

2796

In [9]:
%%time
total_df = total_df.join(crowded_features_df.reset_index(drop=True),
              lsuffix='_izmajlov', rsuffix='_crowded', how='outer').fillna(0)#, inplace=True
# total_df.merge(crowded_features_df.reset_index(drop=True),
#               suffixes=('_izmajlov', '_crowded'), how='outer').fillna(0, inplace=True)
# total_df = pd.concat([total_df, crowded_features_df[list(len(set(total_df.columns)-set(crowded_features_df.columns)))],
#                          clustering_features,
#                          gp_clustering_ii], axis=1, verify_integrity=True).fillna(0)
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53801 entries, 0 to 53800
Columns: 10864 entries, 48df886f9 to gp_clustering_ii_275
dtypes: float16(6), float32(7267), float64(1463), int32(1), uint16(4), uint8(2123)
memory usage: 2.2 GB
Wall time: 9.09 s


In [10]:
%%time
X = total_df.iloc[train_idx_rng]
test = total_df.iloc[test_idx_rng]

Wall time: 1.68 s


In [11]:
X.shape, test.shape

((4459, 10864), (49342, 10864))

In [12]:
%%time
# y = np.log1p(target.target)
y = np.log1p(target)
# y.reshape(1,-1)
y.shape

Wall time: 0 ns


In [13]:
%%time
from boostaroota import BoostARoota
br = BoostARoota(metric='rmse', iters = 20)
br.fit(X, y)

Round:  1  iteration:  1
Round:  1  iteration:  2
Round:  1  iteration:  3
Round:  1  iteration:  4
Round:  1  iteration:  5
Round:  1  iteration:  6
Round:  1  iteration:  7
Round:  1  iteration:  8
Round:  1  iteration:  9
Round:  1  iteration:  10
Round:  1  iteration:  11
Round:  1  iteration:  12
Round:  1  iteration:  13
Round:  1  iteration:  14
Round:  1  iteration:  15
Round:  1  iteration:  16
Round:  1  iteration:  17
Round:  1  iteration:  18
Round:  1  iteration:  19
Round:  1  iteration:  20
Round:  2  iteration:  1
Round:  2  iteration:  2
Round:  2  iteration:  3
Round:  2  iteration:  4
Round:  2  iteration:  5
Round:  2  iteration:  6
Round:  2  iteration:  7
Round:  2  iteration:  8
Round:  2  iteration:  9
Round:  2  iteration:  10
Round:  2  iteration:  11
Round:  2  iteration:  12
Round:  2  iteration:  13
Round:  2  iteration:  14
Round:  2  iteration:  15
Round:  2  iteration:  16
Round:  2  iteration:  17
Round:  2  iteration:  18
Round:  2  iteration:  19
Roun

In [14]:
remaining_vars = list(br.keep_vars_)
print(len(remaining_vars))

784


In [27]:
remaining_vars

['48df886f9',
 '34b15f335',
 '30347e683',
 '6ee66e115',
 '20aa07010_izmajlov',
 'dc5a8f1d8_izmajlov',
 '11d86fa6a_izmajlov',
 '4681de4fd_izmajlov',
 'adf119b9a_izmajlov',
 'cff75dd09',
 'b8a716ebf_izmajlov',
 '6c7a4567c',
 '4fcfd2b4d',
 'f3b9c0b95',
 'd966ac62c',
 '68b647452_izmajlov',
 '0d866c3d7_izmajlov',
 'bd8f989f1_izmajlov',
 '22ed6dba3_izmajlov',
 '92b13ebba_izmajlov',
 '233c7c17c_izmajlov',
 '2cb4d123e_izmajlov',
 '87ffda550_izmajlov',
 '822e49b95_izmajlov',
 'd04e16aed_izmajlov',
 'ec863cb52',
 'ea18d720e_izmajlov',
 '408d86ce9_izmajlov',
 'ecdef52b2_izmajlov',
 '2d6bd8275_izmajlov',
 'dfdf4b580_izmajlov',
 'feed9d437_izmajlov',
 '645b47cde_izmajlov',
 '7298ca1ef_izmajlov',
 '8c94b6675_izmajlov',
 'e421c414e',
 '0656586a4_izmajlov',
 '0b8e10df6_izmajlov',
 'f115e74c0_izmajlov',
 '54b1c1bc0_izmajlov',
 'bd550871c_izmajlov',
 '21e0e6ae3_izmajlov',
 '64534cc93_izmajlov',
 '0f8d7b98e_izmajlov',
 '251d1aa17_izmajlov',
 '0a69cc2be_izmajlov',
 '963a49cdc_izmajlov',
 '587a5d8c3_izmajl

In [29]:
%%time
new_crowded_features_data_store = f'{FEATURES_PATH}crowded_features_v2_data_store.pkl'
print( "Saving data...")
with open(os.path.abspath(new_crowded_features_data_store), 'wb') as f:
    pickle.dump((total_df,
                 target,
                 range(0, len(target)),
                 range(len(target), len(total_df))),
                f, protocol = pickle.HIGHEST_PROTOCOL)
    print('Saved to', os.path.abspath(new_crowded_features_data_store))

new_crowded_features_data_store = f'{FEATURES_PATH}crowded_features_v2_boosted_data_store.pkl'
print( "Saving data...")
with open(os.path.abspath(new_crowded_features_data_store), 'wb') as f:
    pickle.dump((total_df[remaining_vars],
                 target,
                 range(0, len(target)),
                 range(len(target), len(total_df))),
                f, protocol = pickle.HIGHEST_PROTOCOL)
    print('Saved to', os.path.abspath(new_crowded_features_data_store))

Saving data...
Saved to C:\santander-value-prediction-challenge\features\crowded_features_v2_data_store.pkl
Saving data...
Saved to C:\santander-value-prediction-challenge\features\crowded_features_v2_boosted_data_store.pkl
Wall time: 14.8 s


# 4. Modelling

## 4.1 Sections to add models to train

### 4.1.1 LightGBM

In [15]:
def run_lgb(train_X, train_y, val_X, val_y, test_X, seed = RANDOM_STATE):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 361,#40
        'max_depth': 21,
        "learning_rate" : 0.004,#0.005
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.4,#0.6
        "bagging_frequency" : 5,
        "verbosity" : -1,
        'num_threads' : 4,
        "seed": seed
    }
    
    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 10000, 
                      valid_sets=[lgtrain, lgval], 
                      verbose_eval=200, 
                      early_stopping_rounds=200)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, pred_oof_log, model

## 4.2 General function to run models with cv

### 4.2.1 Create custom folds

In [16]:
def get_20_cv_splits(data, output_folder):
    stratify_classes = data.target.apply(lambda x: int(np.log10(x)))
    splits = {}
    for random_state in range(20):
        column = np.zeros(data.shape[0])
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
        for i, (_, test_index) in enumerate(sss.split(data, stratify_classes)):
            column[test_index] = i

        splits["split{}".format(random_state)] = column

    pd.DataFrame(splits, index=data.index).to_csv(f'{output_folder}cv_splits.csv')

In [17]:
# function to generate 100 folds from create_folds_from_cv_splits func
def create_folds_from_cv_splits(in_path = f'{FOLDS_PATH}cv_splits.csv'):
    
    cv_splits = pd.read_csv(in_path)
    folds_list = []
    for ind, i in enumerate(cv_splits.columns[1:]):
        folds = list(set(cv_splits[i].values))
        folds_list.append([])
        for m in folds:
            val_idx = list(cv_splits[cv_splits[i]==m].index)
            train_idx = list(set(list(cv_splits.index)) - set(val_idx))
            folds_list[ind].append((train_idx, val_idx))
    with open(f'{FOLDS_PATH}custom_cv.pkl', 'wb') as f:
        pickle.dump(folds_list, f)
    return folds_list

### 4.2.2 General logic to train single function

In [18]:
def run_calculations(X, test, big_cv_folds, func_name = None):
    if not func_name:
        return print('The function to run is not defined')
    else:
        y_oof_20_preds = []
        fold_errors_20_preds =[]
        avg_test_pred_20_preds = []
        models_20_preds = []
        
        start_time = time.time()
        for ind, cv_folds in enumerate(big_cv_folds):
#             print('Fitting big fold', ind+1, 'out of', len(big_cv_folds))
            y_oof = np.zeros((y.shape[0]))
            fold_errors =[]
            pred_test_list = []
            models = []
            
            for i, (train_index, val_index) in enumerate(cv_folds):
                print('Fitting big fold', ind+1, 'out of', len(big_cv_folds),
                      'and sub fold', i+1, 'out of', len(cv_folds))
                X_train, X_val  = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y[train_index], y[val_index]

                # part to include additional functions
                if func_name == 'lgb':
                    pred_test_y, pred_oof_log, clf = run_lgb(X_train, y_train, X_val, y_val, test,
                                                             seed = (RANDOM_STATE + ind*i))
                    models.append(clf)
                else:
                    return print('The function to run is not correct')

                y_oof[val_index] = pred_oof_log
                curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
                print(f'Fold error {curr_fe}')
                fold_errors.append(curr_fe)
                pred_test_list.append(list(pred_test_y))
                print('Time passed: {} seconds.'.format(time.time() - start_time))

            print('Average big fold', ind+1, 'error:', np.sqrt(mean_squared_error(y, y_oof)))
            total_fe_std = round(np.std(fold_errors), 5)
            print(f'Total big fold {ind+1} std {total_fe_std}')
            avg_test_pred = np.mean(pred_test_list, axis=0)
            
            avg_test_pred_20_preds.append(avg_test_pred)
            fold_errors_20_preds.append(fold_errors)
            y_oof_20_preds.append(y_oof)
            models_20_preds.append(models)
            
        return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds, models_20_preds

## 4.3 Run experiments with different data

### 4.3.1 generate or load custom cv folds

In [19]:
%%time
LOAD_CV = True

if LOAD_CV:
    with open(f'{FOLDS_PATH}custom_cv.pkl', 'rb') as f:
        cv_folds = pickle.load(f)
else:
    get_20_cv_splits(train_df, FOLDS_PATH)
    cv_folds = create_folds_from_cv_splits()

Wall time: 29 ms


### 4.3.2 run models

In [20]:
%%time
y_oof_lgb, avg_test_pred_20, fold_errors, models_all = run_calculations(X[remaining_vars],
                                                                        test[remaining_vars], cv_folds, 'lgb')

Fitting big fold 1 out of 20 and sub fold 1 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.1276	valid_1's rmse: 1.42196
[400]	training's rmse: 0.791611	valid_1's rmse: 1.34043
[600]	training's rmse: 0.582924	valid_1's rmse: 1.32145
[800]	training's rmse: 0.440204	valid_1's rmse: 1.31733
[1000]	training's rmse: 0.336647	valid_1's rmse: 1.31644
[1200]	training's rmse: 0.261889	valid_1's rmse: 1.31598
[1400]	training's rmse: 0.206897	valid_1's rmse: 1.31531
[1600]	training's rmse: 0.165969	valid_1's rmse: 1.31509
Early stopping, best iteration is:
[1571]	training's rmse: 0.171252	valid_1's rmse: 1.31505
Model training done in 98.60400986671448 seconds.
Fold error 1.3150473565745955
Time passed: 105.04500651359558 seconds.
Fitting big fold 1 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12361	valid_1's rmse: 1.39054
[400]	training's rmse: 0.788613	valid_1's rmse: 1.

Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.13073	valid_1's rmse: 1.40822
[400]	training's rmse: 0.7957	valid_1's rmse: 1.3275
[600]	training's rmse: 0.587645	valid_1's rmse: 1.30981
[800]	training's rmse: 0.444493	valid_1's rmse: 1.30505
[1000]	training's rmse: 0.341541	valid_1's rmse: 1.30375
[1200]	training's rmse: 0.266516	valid_1's rmse: 1.30404
Early stopping, best iteration is:
[1060]	training's rmse: 0.316802	valid_1's rmse: 1.3037
Model training done in 69.98900651931763 seconds.
Fold error 1.303704560836813
Time passed: 1510.2700905799866 seconds.
Fitting big fold 3 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12478	valid_1's rmse: 1.37603
[400]	training's rmse: 0.789754	valid_1's rmse: 1.27618
[600]	training's rmse: 0.583004	valid_1's rmse: 1.25053
[800]	training's rmse: 0.440863	valid_1's rmse: 1.24394
[1000]	training's rmse: 0.340296	valid_1's rmse: 1.24

[400]	training's rmse: 0.796429	valid_1's rmse: 1.34673
[600]	training's rmse: 0.588583	valid_1's rmse: 1.32768
[800]	training's rmse: 0.444596	valid_1's rmse: 1.32331
[1000]	training's rmse: 0.340118	valid_1's rmse: 1.32316
Early stopping, best iteration is:
[885]	training's rmse: 0.39621	valid_1's rmse: 1.32275
Model training done in 62.533103942871094 seconds.
Fold error 1.3227486802961583
Time passed: 2945.592841863632 seconds.
Fitting big fold 5 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12378	valid_1's rmse: 1.38441
[400]	training's rmse: 0.788396	valid_1's rmse: 1.29165
[600]	training's rmse: 0.581209	valid_1's rmse: 1.26574
[800]	training's rmse: 0.439794	valid_1's rmse: 1.25854
[1000]	training's rmse: 0.338258	valid_1's rmse: 1.25594
[1200]	training's rmse: 0.266053	valid_1's rmse: 1.25372
[1400]	training's rmse: 0.213765	valid_1's rmse: 1.25251
[1600]	training's rmse: 0.173876	valid_1's rmse: 1.252

Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.13092	valid_1's rmse: 1.40343
[400]	training's rmse: 0.795229	valid_1's rmse: 1.31925
[600]	training's rmse: 0.587303	valid_1's rmse: 1.29977
[800]	training's rmse: 0.444557	valid_1's rmse: 1.29493
[1000]	training's rmse: 0.341066	valid_1's rmse: 1.29317
[1200]	training's rmse: 0.266319	valid_1's rmse: 1.29292
[1400]	training's rmse: 0.212139	valid_1's rmse: 1.29342
Early stopping, best iteration is:
[1203]	training's rmse: 0.265395	valid_1's rmse: 1.29287
Model training done in 79.23400354385376 seconds.
Fold error 1.2928731565279858
Time passed: 4436.455931425095 seconds.
Fitting big fold 7 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12219	valid_1's rmse: 1.42413
[400]	training's rmse: 0.788471	valid_1's rmse: 1.32647
[600]	training's rmse: 0.58179	valid_1's rmse: 1.29691
[800]	training's rmse: 0.439823	valid_1's rmse: 1

[1000]	training's rmse: 0.332938	valid_1's rmse: 1.33741
[1200]	training's rmse: 0.257503	valid_1's rmse: 1.33673
[1400]	training's rmse: 0.203008	valid_1's rmse: 1.33689
Early stopping, best iteration is:
[1260]	training's rmse: 0.239562	valid_1's rmse: 1.33658
Model training done in 82.80100727081299 seconds.
Fold error 1.336579411278092
Time passed: 5781.731018066406 seconds.
Fitting big fold 9 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12303	valid_1's rmse: 1.40999
[400]	training's rmse: 0.786837	valid_1's rmse: 1.3308
[600]	training's rmse: 0.579154	valid_1's rmse: 1.30993
[800]	training's rmse: 0.438039	valid_1's rmse: 1.30441
[1000]	training's rmse: 0.336819	valid_1's rmse: 1.30184
[1200]	training's rmse: 0.264527	valid_1's rmse: 1.30172
[1400]	training's rmse: 0.212374	valid_1's rmse: 1.30145
[1600]	training's rmse: 0.171349	valid_1's rmse: 1.30109
[1800]	training's rmse: 0.139858	valid_1's rmse: 1.3

[600]	training's rmse: 0.580957	valid_1's rmse: 1.33475
[800]	training's rmse: 0.438946	valid_1's rmse: 1.33
[1000]	training's rmse: 0.335547	valid_1's rmse: 1.32944
Early stopping, best iteration is:
[974]	training's rmse: 0.347283	valid_1's rmse: 1.32927
Model training done in 67.08000707626343 seconds.
Fold error 1.329270216950129
Time passed: 7298.998335599899 seconds.
Fitting big fold 11 out of 20 and sub fold 2 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12624	valid_1's rmse: 1.35537
[400]	training's rmse: 0.78838	valid_1's rmse: 1.28629
[600]	training's rmse: 0.580459	valid_1's rmse: 1.27484
[800]	training's rmse: 0.43807	valid_1's rmse: 1.27183
[1000]	training's rmse: 0.337015	valid_1's rmse: 1.27091
[1200]	training's rmse: 0.263187	valid_1's rmse: 1.26972
[1400]	training's rmse: 0.210697	valid_1's rmse: 1.26832
[1600]	training's rmse: 0.170795	valid_1's rmse: 1.26739
[1800]	training's rmse: 0.1403	valid_1's rmse: 1.26723
[20

[400]	training's rmse: 0.787314	valid_1's rmse: 1.29916
[600]	training's rmse: 0.58043	valid_1's rmse: 1.27884
[800]	training's rmse: 0.439343	valid_1's rmse: 1.27377
[1000]	training's rmse: 0.338679	valid_1's rmse: 1.27168
[1200]	training's rmse: 0.267088	valid_1's rmse: 1.27053
[1400]	training's rmse: 0.213846	valid_1's rmse: 1.27035
[1600]	training's rmse: 0.173198	valid_1's rmse: 1.27038
Early stopping, best iteration is:
[1547]	training's rmse: 0.182415	valid_1's rmse: 1.27025
Model training done in 148.6240096092224 seconds.
Fold error 1.2702452082417959
Time passed: 8755.20742559433 seconds.
Fitting big fold 13 out of 20 and sub fold 3 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12181	valid_1's rmse: 1.40523
[400]	training's rmse: 0.785717	valid_1's rmse: 1.33305
[600]	training's rmse: 0.578019	valid_1's rmse: 1.32111
[800]	training's rmse: 0.436439	valid_1's rmse: 1.31935
[1000]	training's rmse: 0.334932	valid_1's rmse: 1.318

[1000]	training's rmse: 0.336034	valid_1's rmse: 1.31402
[1200]	training's rmse: 0.263422	valid_1's rmse: 1.31278
Early stopping, best iteration is:
[1156]	training's rmse: 0.277193	valid_1's rmse: 1.31265
Model training done in 114.75500750541687 seconds.
Fold error 1.3126453699249057
Time passed: 10065.58894777298 seconds.
Fitting big fold 15 out of 20 and sub fold 4 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12899	valid_1's rmse: 1.37358
[400]	training's rmse: 0.792876	valid_1's rmse: 1.28765
[600]	training's rmse: 0.584771	valid_1's rmse: 1.26909
[800]	training's rmse: 0.442575	valid_1's rmse: 1.2631
[1000]	training's rmse: 0.340199	valid_1's rmse: 1.26238
[1200]	training's rmse: 0.268261	valid_1's rmse: 1.26211
Early stopping, best iteration is:
[1035]	training's rmse: 0.326188	valid_1's rmse: 1.26207
Model training done in 100.65500569343567 seconds.
Fold error 1.2620745294250957
Time passed: 10171.24795126915 seconds.
Fitting

[3000]	training's rmse: 0.0573682	valid_1's rmse: 1.35444
Early stopping, best iteration is:
[2832]	training's rmse: 0.0635257	valid_1's rmse: 1.35437
Model training done in 248.90101528167725 seconds.
Fold error 1.354369857380502
Time passed: 11515.908035039902 seconds.
Fitting big fold 17 out of 20 and sub fold 3 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.11629	valid_1's rmse: 1.46022
[400]	training's rmse: 0.78162	valid_1's rmse: 1.37248
[600]	training's rmse: 0.575449	valid_1's rmse: 1.35173
[800]	training's rmse: 0.43425	valid_1's rmse: 1.34822
[1000]	training's rmse: 0.33332	valid_1's rmse: 1.34799
Early stopping, best iteration is:
[979]	training's rmse: 0.342562	valid_1's rmse: 1.34779
Model training done in 100.05085182189941 seconds.
Fold error 1.3477860604873484
Time passed: 11620.875889539719 seconds.
Fitting big fold 17 out of 20 and sub fold 4 out of 5
Training until validation scores don't improve for 200 rounds.
[200

Model training done in 117.02631855010986 seconds.
Fold error 1.335017397455523
Time passed: 13007.2466776371 seconds.
Fitting big fold 19 out of 20 and sub fold 4 out of 5
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 1.12167	valid_1's rmse: 1.41816
[400]	training's rmse: 0.784793	valid_1's rmse: 1.35102
[600]	training's rmse: 0.578188	valid_1's rmse: 1.33607
[800]	training's rmse: 0.436797	valid_1's rmse: 1.33348
[1000]	training's rmse: 0.336201	valid_1's rmse: 1.33219
[1200]	training's rmse: 0.26301	valid_1's rmse: 1.33158
[1400]	training's rmse: 0.210423	valid_1's rmse: 1.33117
[1600]	training's rmse: 0.169373	valid_1's rmse: 1.33083
[1800]	training's rmse: 0.138198	valid_1's rmse: 1.33059
[2000]	training's rmse: 0.11416	valid_1's rmse: 1.33018
[2200]	training's rmse: 0.0958895	valid_1's rmse: 1.32995
[2400]	training's rmse: 0.0818441	valid_1's rmse: 1.32984
[2600]	training's rmse: 0.0711935	valid_1's rmse: 1.32986
Early stopping, best iterat

In [21]:
%%time
avg_error = np.mean(([np.mean(x) for x in fold_errors]))
avg_std = np.std(([np.mean (x) for x in fold_errors]))
print(f'Average error across 20 folds: {avg_error.round(5)}')
print(f'Average std across 20 folds: {avg_std.round(5)}')
print(f'(averaged std of 20 folds: {np.mean(([np.std (x) for x in fold_errors])).round(5)})')

Average error across 20 folds: 1.3028
Average std across 20 folds: 0.00698
(averaged std of 20 folds: 0.03007)
Wall time: 3.5 ms


In [22]:
len(y_oof_lgb), len(avg_test_pred_20), len(fold_errors)

(20, 20, 20)

In [23]:
print('Length of test predictions:', len(avg_test_pred_20))
avg_pred_test_list_lgb = np.mean(avg_test_pred_20, axis=0)
print('Length of avg test predictions:', len(avg_pred_test_list_lgb))

Length of test predictions: 20
Length of avg test predictions: 49342


In [24]:
pd.DataFrame(fold_errors).to_csv(f'{PRED_TRAIN_PATH}{MODEL_NAME}_fold_errors_cv{avg_error*1000:0>4.0f}_std{avg_std*1000:0>4.0f}.csv',
                                 index=False, header=False)

# 5.Stacking/Blending - TBA

# 6. Save results and prepare model for submitting

In [25]:
%%time
# 20x oof train preds
with open(f'{PRED_TRAIN_PATH}{MODEL_NAME}_20folds_train_cv{avg_error*1000:0>4.0f}_std{avg_std*1000:0>4.0f}.pkl', 'wb') as f:
    pickle.dump(y_oof_lgb, f)
    
#20x test preds
with open(f'{PRED_TRAIN_PATH}{MODEL_NAME}_20folds_test_cv{avg_error*1000:0>4.0f}_std{avg_std*1000:0>4.0f}.pkl', 'wb') as f:
    pickle.dump(avg_test_pred_20, f)

#20x5 models
# with open(f'{PRED_TRAIN_PATH}{MODEL_NAME}_20folds_models_cv{avg_error*1000:0>4.0f}_std{avg_std*1000:0>4.0f}.pkl', 'wb') as f:
#     pickle.dump(models_all, f)

Wall time: 26 ms


In [26]:
%%time
sub_df = pd.read_csv(f'{DATA_PATH}sample_submission.csv')#, usecols = ['ID']
model_to_submit = pd.DataFrame({ 'ID': sub_df['ID'].values,
                            'target': avg_pred_test_list_lgb})
model_to_submit.to_csv(f'{PRED_TEST_PATH}{MODEL_NAME}_20folds_test_averaged_cv{avg_error*1000:0>4.0f}_std{avg_std*1000:0>4.0f}.csv',
                       index=False)

Wall time: 226 ms
