In [1]:
import os
import gc
import time

import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
PATH_TO_DATA = ('D:/Py/DataFrames/Santander_Value_Prediction_Challenge(KAGGLE)/')

## Reading and joining data

In [3]:
y = pd.read_csv(os.path.join(PATH_TO_DATA, 'input/train.csv'), usecols=['target'])
y = np.log1p(y.target.values)
print('Target length:', len(y))

Target length: 4459


In [4]:
%%time
train_df_statistic_and_bin = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_with_row_statistic_and_bin_thresh098.csv'))
test_df_statistic_and_bin = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_with_row_statistic_and_bin_thresh098.csv'))

train_space_reduction = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_space_reduction_thresh098.csv'))
test_space_reduction = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_space_reduction_thresh098.csv'))

Wall time: 33.1 s


In [5]:
# Use only FA and SRP features
cols_to_drop = [x for x in train_space_reduction.columns if str(x)[:2] not in ['fa', 'sr']]
print('Drop', len(cols_to_drop), 'columns')
train_space_reduction = train_space_reduction.drop(cols_to_drop, axis=1, inplace=True)
test_space_reduction = test_space_reduction.drop(cols_to_drop, axis=1, inplace=True)

Drop 600 columns


In [6]:
%%time
train_df = pd.concat([train_df_statistic_and_bin, train_space_reduction], axis=1)
test_df = pd.concat([test_df_statistic_and_bin, test_space_reduction], axis=1)

del train_df_statistic_and_bin, test_df_statistic_and_bin
del train_space_reduction, test_space_reduction
gc.collect()

print('Train:', train_df.shape)
print('Test:', test_df.shape)

Train: (4459, 3370)
Test: (49342, 3370)
Wall time: 888 ms


## XGBoost + LightGBM + CV Folds

In [15]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'metric': 'rmse',
        'learning_rate': 0.005,
        'max_depth': 8,
        'subsample': 0.7,
        'colsample_bytree': 0.1,
        'alpha':0,
        'lambda': 0,
        'gamma': 1.5,
        'silent': True,
        'random_state': 44,
        'nthread': -1
    }
    
    start_time = time.time()
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgval = xgb.DMatrix(val_X, label=val_y)
    xgtest = xgb.DMatrix(test_X)
    watchlist = [(xgtrain, 'train'), (xgval, 'valid')]
    model = xgb.train(params, xgtrain, 2000, watchlist, 
                      early_stopping_rounds=50, 
                      verbose_eval=100)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(xgtest, ntree_limit=model.best_ntree_limit))
    pred_oof_log = model.predict(xgval, ntree_limit=model.best_ntree_limit)
    return pred_test_y, pred_oof_log, model

In [16]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        'max_depth': 8,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.1, #0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 44,
        "verbosity" : -1,
        'num_threads' : 4,
        "seed": 44
    }
    
    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    
    return pred_test_y, pred_oof_log, model

In [8]:
def run_calculations(X, test, func_name = None):
    if not func_name:
        return print('The function to run is not defined')
    else:
        n_splits = 5
        random_state = 44

        y_oof = np.zeros((y.shape[0]))
        fold_errors =[]
        pred_test_list = []
        

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        for i, (train_index, val_index) in enumerate(kf.split(y)):
            print('Fitting fold', i+1, 'out of', n_splits)
            X_train, X_val  = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y[train_index], y[val_index]
            if func_name == 'lgb':
                pred_test_y, pred_oof_log, clf = run_lgb(X_train, y_train, X_val, y_val, test)
            elif func_name == 'xgb':
                pred_test_y, pred_oof_log, clf = run_xgb(X_train, y_train, X_val, y_val, test)
            elif func_name == 'cat':
                pred_test_y, pred_oof_log, clf = run_cat(X_train, y_train, X_val, y_val, test)
            else:
                return print('The function to run is not correct')
                
            y_oof[val_index] = pred_oof_log
            curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
            print(f'Fold error {curr_fe}')
            fold_errors.append(curr_fe)
            pred_test_list.append(list(pred_test_y))
            
        print('Total error', np.sqrt(mean_squared_error(y, y_oof)))
        total_fe_std = round(np.std(fold_errors), 5)
        print(f'Total std {total_fe_std}')
        avg_test_pred = np.mean(pred_test_list, axis=0)
        
        return y_oof, avg_test_pred

In [29]:
%%time
y_oof_lgb, pred_test_list_lgb = run_calculations(train_df, test_df, 'lgb')

Fitting fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.42978	valid_1's rmse: 1.43401
[300]	training's rmse: 1.27366	valid_1's rmse: 1.33583
[450]	training's rmse: 1.18603	valid_1's rmse: 1.30047
[600]	training's rmse: 1.1261	valid_1's rmse: 1.28624
[750]	training's rmse: 1.08592	valid_1's rmse: 1.28166
[900]	training's rmse: 1.05535	valid_1's rmse: 1.28031
[1050]	training's rmse: 1.03167	valid_1's rmse: 1.28086
Early stopping, best iteration is:
[957]	training's rmse: 1.04527	valid_1's rmse: 1.28005
Model training done in 28.302698373794556 seconds.
Fold error 1.280050628941497
Fitting fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.41304	valid_1's rmse: 1.49962
[300]	training's rmse: 1.25608	valid_1's rmse: 1.41352
[450]	training's rmse: 1.17101	valid_1's rmse: 1.38379
[600]	training's rmse: 1.11253	valid_1's rmse: 1.373
[750]	training's rmse: 1.07265	valid_1's rmse: 1.3695