In [1]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
import statistics

import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import pearsonr

from skimpy import skim

In [27]:
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit

In [4]:
import lightgbm as lgb

In [5]:
xtrain = pd.read_parquet("data/train_low_mem.parquet").sample(frac=0.1)
xtrain.head(10)

Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
184006,80_3707,80,3707,-0.716359,-1.485398,0.888295,-1.036632,1.04093,-0.523327,-2.045977,...,-0.000617,-0.67039,0.577094,1.716897,-0.91115,-0.504205,-1.05242,1.932024,2.229024,-0.410322
2152217,909_3065,909,3065,-2.489647,0.240521,1.800758,-0.969943,2.252556,-0.24825,1.008099,...,-2.396864,1.850617,-0.089411,1.398223,1.513186,1.658899,-0.301949,2.156229,1.622897,-0.590806
1986380,853_3458,853,3458,-0.484678,0.25083,-1.441274,-0.285294,-0.505872,-0.078618,-0.618547,...,-0.959207,2.393993,-0.744012,-2.249607,0.231,-0.067536,-1.090453,0.788107,1.043107,-0.332152
2939364,1159_2129,1159,2129,-0.87695,-2.358495,1.090946,-0.483831,-0.49383,-0.104828,2.090999,...,0.390627,-0.924926,-0.493301,0.271502,0.983961,-0.191658,1.307316,1.644917,-0.094997,-0.277184
2335003,968_2618,968,2618,-2.447303,0.134645,0.286206,-2.012456,1.62757,0.084117,0.322417,...,-0.485044,1.569574,-1.537639,1.980731,1.268298,1.260458,-0.278722,2.241691,-0.225564,0.223694
703299,313_2088,313,2088,-0.316797,-0.523676,0.351873,-0.425329,-0.853535,-0.056294,-0.001557,...,1.321556,1.090489,0.195078,0.544615,0.709029,-0.401945,-0.979983,-0.338732,0.354642,-0.258514
848232,396_3671,396,3671,1.109364,-0.280327,-0.53867,1.270649,-0.259013,-0.607388,-0.027482,...,-1.344543,1.04903,-0.26941,0.148936,0.810742,-0.819713,-1.265566,1.013334,-0.406384,-0.634967
643452,286_558,286,558,2.723757,0.0,-0.350994,-0.799045,1.268173,-0.316717,-0.434571,...,-1.31802,1.265708,-0.792685,0.121929,-0.831409,-0.087748,-0.263308,0.794776,-0.647565,-0.56417
1753731,772_933,772,933,0.278001,0.875139,0.255808,2.358301,-0.337233,-0.494942,0.452321,...,0.417376,-0.727061,-0.367297,0.324696,-0.956083,-0.678773,0.978413,-0.501697,-0.77864,-0.619525
2278442,950_2310,950,2310,0.129507,0.59237,-0.676463,0.643829,-0.380092,-0.238719,0.045468,...,-0.630003,-0.652828,-0.28287,0.295061,-0.83132,-0.537811,1.036171,-0.072299,-0.046079,-0.384937


In [6]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


In [7]:
xtrain['time_id'].min(), xtrain['time_id'].max()

(0, 1219)

In [8]:
xtest = xtrain.loc[xtrain.time_id > 1_100].copy().reset_index(drop=True)
xtrain = xtrain.loc[xtrain.time_id <= 1_100].copy().reset_index(drop=True)
print(xtrain.shape, xtest.shape)

(274443, 304) (39698, 304)


In [9]:
id_train, id_test = xtrain['row_id'].copy(), xtest['row_id'].copy()
xtrain.drop('row_id', axis=1, inplace=True)
xtest.drop('row_id', axis=1, inplace=True)

ytrain, ytest = xtrain['target'].copy(), xtest['target'].copy()
inv_train, inv_test = xtrain['investment_id'].copy(), xtest['investment_id'].copy()
time_train, time_test = xtrain['time_id'].copy(), xtest['time_id'].copy()

xtrain.drop(['time_id', 'investment_id', 'target'], axis=1, inplace=True)
xtest.drop(['time_id', 'investment_id', 'target'], axis=1, inplace=True)

In [10]:
xtrain = reduce_mem_usage(xtrain)
xtest = reduce_mem_usage(xtest)

gc.collect()

Memory usage of dataframe is 314.08 MB
Memory usage after optimization is: 157.04 MB
Decreased by 50.0%
Memory usage of dataframe is 45.43 MB
Memory usage after optimization is: 22.72 MB
Decreased by 50.0%


0

In [11]:
x0, x1, y0, y1 = train_test_split(xtrain, ytrain, test_size=0.33, random_state=42)

lgb_parameters = {'objective': 'regression', 'metric': 'rmse', 'num_iterations': 150, 
                  'num_leaves': 32, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.6}

model = lgb.LGBMRegressor(**lgb_parameters)

In [12]:
model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)], verbose=250, early_stopping_rounds=100)
val_preds = model.predict(x1)

In [13]:
# validation score    
score = pearsonr(val_preds, y1)[0]
print(f"validation score: {score:.4f}")

# actual test performance
test_preds = model.predict(xtest)
score = pearsonr(test_preds, ytest)[0]
print(f"test score: {score:.4f}")


del x0, x1, y0, y1; gc.collect()

validation score: 0.1368
test score: 0.1050


4

In [24]:
kf = KFold(n_splits=10, shuffle=True)
val_scores = []
preds = np.zeros((xtest.shape[0], 10))


for (i, (train_idx, test_idx)) in enumerate(kf.split(xtrain)):
    print(f'fold: {i}')
    
    X_train, X_test = xtrain.loc[train_idx], xtrain.loc[test_idx]
    y_train, y_test = ytrain.loc[train_idx], ytrain.loc[test_idx]
    
    model = lgb.LGBMRegressor(**lgb_parameters)
    
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=100, verbose=False)
    model.predict(X_test)
    
    # validation score    
    score = pearsonr(model.predict(X_test), y_test)[0]
    val_scores.append(score)
    print(f"validation score: {score:.4f}")

    # actual test performance
    pred = model.predict(xtest)
    preds[:, i] = pred
    score = pearsonr(pred, ytest)[0]
    print(f"test score: {score:.4f}")


    del model, X_train, X_test, y_train, y_test; gc.collect()

fold: 0
validation score: 0.1506
test score: 0.1181
fold: 1
validation score: 0.1430
test score: 0.1091
fold: 2
validation score: 0.1278
test score: 0.1061
fold: 3
validation score: 0.1562
test score: 0.1208
fold: 4
validation score: 0.1384
test score: 0.1099
fold: 5
validation score: 0.1436
test score: 0.1117
fold: 6
validation score: 0.1291
test score: 0.1110
fold: 7
validation score: 0.1403
test score: 0.1087
fold: 8
validation score: 0.1326
test score: 0.1106
fold: 9
validation score: 0.1370
test score: 0.1106


In [25]:
statistics.mean(val_scores)

0.13984889071283133

In [26]:
pearsonr(preds.mean(axis=1), ytest)[0]

0.1329938089514993

In [28]:
tscv = TimeSeriesSplit(n_splits=10)

val_scores = []
preds = np.zeros((xtest.shape[0], 10))


for (i, (train_idx, test_idx)) in enumerate(tscv.split(xtrain)):
    print(f'fold: {i}')
    
    X_train, X_test = xtrain.loc[train_idx], xtrain.loc[test_idx]
    y_train, y_test = ytrain.loc[train_idx], ytrain.loc[test_idx]
    
    model = lgb.LGBMRegressor(**lgb_parameters)
    
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=100, verbose=False)
    model.predict(X_test)
    
    # validation score    
    score = pearsonr(model.predict(X_test), y_test)[0]
    val_scores.append(score)
    print(f"validation score: {score:.4f}")

    # actual test performance
    pred = model.predict(xtest)
    preds[:, i] = pred
    score = pearsonr(pred, ytest)[0]
    print(f"test score: {score:.4f}")


    del model, X_train, X_test, y_train, y_test; gc.collect()

fold: 0
validation score: 0.0925
test score: 0.0822
fold: 1
validation score: 0.1031
test score: 0.0960
fold: 2
validation score: 0.1022
test score: 0.0899
fold: 3
validation score: 0.1208
test score: 0.0960
fold: 4
validation score: 0.1319
test score: 0.1045
fold: 5
validation score: 0.1192
test score: 0.1039
fold: 6
validation score: 0.1358
test score: 0.0922
fold: 7
validation score: 0.1386
test score: 0.1078
fold: 8
validation score: 0.1302
test score: 0.0996
fold: 9
validation score: 0.1391
test score: 0.1099


In [29]:
statistics.mean(val_scores)

0.1213296037330278

In [30]:
pearsonr(preds.mean(axis=1), ytest)[0]

0.1266135736097533

In [31]:
tscv = TimeSeriesSplit(n_splits=10, max_train_size=xtest.shape[0] * 2)

val_scores = []
preds = np.zeros((xtest.shape[0], 10))


for (i, (train_idx, test_idx)) in enumerate(tscv.split(xtrain)):
    print(f'fold: {i}')
    
    X_train, X_test = xtrain.loc[train_idx], xtrain.loc[test_idx]
    y_train, y_test = ytrain.loc[train_idx], ytrain.loc[test_idx]
    
    model = lgb.LGBMRegressor(**lgb_parameters)
    
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=100, verbose=False)
    model.predict(X_test)
    
    # validation score    
    score = pearsonr(model.predict(X_test), y_test)[0]
    val_scores.append(score)
    print(f"validation score: {score:.4f}")

    # actual test performance
    pred = model.predict(xtest)
    preds[:, i] = pred
    score = pearsonr(pred, ytest)[0]
    print(f"test score: {score:.4f}")


    del model, X_train, X_test, y_train, y_test; gc.collect()

fold: 0
validation score: 0.0925
test score: 0.0822
fold: 1
validation score: 0.1031
test score: 0.0960
fold: 2
validation score: 0.1022
test score: 0.0899
fold: 3
validation score: 0.1164
test score: 0.1017
fold: 4
validation score: 0.1082
test score: 0.0848
fold: 5
validation score: 0.0967
test score: 0.0984
fold: 6
validation score: 0.1063
test score: 0.0777
fold: 7
validation score: 0.1216
test score: 0.0864
fold: 8
validation score: 0.1070
test score: 0.0986
fold: 9
validation score: 0.1077
test score: 0.0886


In [32]:
statistics.mean(val_scores)

0.10617067563712244

In [33]:
pearsonr(preds.mean(axis=1), ytest)[0]

0.12722595962714603

In [34]:
# Taken from the notebo
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_size : int, default=None
        Maximum size for a single training set.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupTimeSeriesSplit
    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',\
                           'b', 'b', 'b', 'b', 'b',\
                           'c', 'c', 'c', 'c',\
                           'd', 'd', 'd'])
    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
    ...     print("TRAIN GROUP:", groups[train_idx],\
                  "TEST GROUP:", groups[test_idx])
    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']\
    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']\
    TEST GROUP: ['c' 'c' 'c' 'c']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\
    TEST: [15, 16, 17]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']\
    TEST GROUP: ['d' 'd' 'd']
    """
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [36]:
gtss = GroupTimeSeriesSplit(n_splits=10)

val_scores = []
preds = np.zeros((xtest.shape[0], 10))


for (i, (train_idx, test_idx)) in enumerate(gtss.split(xtrain, groups=pd.DataFrame(time_train)['time_id'])):
    print(f'fold: {i}')
    
    X_train, X_test = xtrain.loc[train_idx], xtrain.loc[test_idx]
    y_train, y_test = ytrain.loc[train_idx], ytrain.loc[test_idx]
    
    model = lgb.LGBMRegressor(**lgb_parameters)
    
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=100, verbose=False)
    model.predict(X_test)
    
    # validation score    
    score = pearsonr(model.predict(X_test), y_test)[0]
    val_scores.append(score)
    print(f"validation score: {score:.4f}")

    # actual test performance
    pred = model.predict(xtest)
    preds[:, i] = pred
    score = pearsonr(pred, ytest)[0]
    print(f"test score: {score:.4f}")


    del model, X_train, X_test, y_train, y_test; gc.collect()

fold: 0
validation score: 0.0530
test score: 0.0749
fold: 1
validation score: 0.0958
test score: 0.0854
fold: 2
validation score: 0.1167
test score: 0.0832
fold: 3
validation score: 0.0999
test score: 0.0893
fold: 4
validation score: 0.1260
test score: 0.0945
fold: 5
validation score: 0.0999
test score: 0.0985
fold: 6
validation score: 0.1175
test score: 0.1002
fold: 7
validation score: 0.1263
test score: 0.1149
fold: 8
validation score: 0.1407
test score: 0.1090
fold: 9
validation score: 0.1229
test score: 0.1212


In [37]:
statistics.mean(val_scores)

0.10987748460714743

In [38]:
pearsonr(preds.mean(axis=1), ytest)[0]

0.12679121038839772

In [39]:
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [41]:
cv = PurgedGroupTimeSeriesSplit(n_splits=10, max_train_group_size=15, group_gap=5, max_test_group_size=5)

val_scores = []
preds = np.zeros((xtest.shape[0], 10))


for (i, (train_idx, test_idx)) in enumerate(cv.split(xtrain, groups=pd.DataFrame(time_train)['time_id'])):
    print(f'fold: {i}')
    
    X_train, X_test = xtrain.loc[train_idx], xtrain.loc[test_idx]
    y_train, y_test = ytrain.loc[train_idx], ytrain.loc[test_idx]
    
    model = lgb.LGBMRegressor(**lgb_parameters)
    
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=100, verbose=False)
    model.predict(X_test)
    
    # validation score    
    score = pearsonr(model.predict(X_test), y_test)[0]
    val_scores.append(score)
    print(f"validation score: {score:.4f}")

    # actual test performance
    pred = model.predict(xtest)
    preds[:, i] = pred
    score = pearsonr(pred, ytest)[0]
    print(f"test score: {score:.4f}")


    del model, X_train, X_test, y_train, y_test; gc.collect()

fold: 0
validation score: -0.0455
test score: 0.0558
fold: 1
validation score: -0.0089
test score: 0.0282
fold: 2
validation score: 0.1164
test score: 0.0098
fold: 3
validation score: -0.0370
test score: 0.0137
fold: 4
validation score: 0.0844
test score: -0.0011
fold: 5
validation score: 0.0591
test score: -0.0160
fold: 6
validation score: 0.1007
test score: 0.0176
fold: 7
validation score: 0.0415
test score: 0.0066
fold: 8
validation score: -0.0500
test score: 0.0058
fold: 9
validation score: 0.1552
test score: 0.0115


In [42]:
statistics.mean(val_scores)

0.04159736352142262

In [43]:
pearsonr(preds.mean(axis=1), ytest)[0]

0.02297605246264159

In [44]:
from scipy.special import comb
from itertools import combinations

class CombinatorialPurgedGroupKFold():
    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo
        
    def split(self, X, y = None, groups = None):
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
            
        u, ind = np.unique(groups, return_index = True)
        unique_groups = u[np.argsort(ind)]
        n_groups = len(unique_groups)
        group_dict = {}
        for idx in range(len(X)):
            if groups[idx] in group_dict:
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
                
        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
            
        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")
        
        split_dict = {}
        group_test_size = n_groups // self.n_splits
        for split in range(self.n_splits):
            if split == self.n_splits - 1:
                split_dict[split] = unique_groups[int(split * group_test_size):].tolist()
            else:
                split_dict[split] = unique_groups[int(split * group_test_size):int((split + 1) * group_test_size)].tolist()
        
        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_groups = []
            banned_groups = []
            for split in test_splits:
                test_groups += split_dict[split]
                banned_groups += unique_groups[split_dict[split][0] - self.purge:split_dict[split][0]].tolist()
                banned_groups += unique_groups[split_dict[split][-1] + 1:split_dict[split][-1] + self.purge + mbrg + 1].tolist()
            train_groups = [i for i in unique_groups if (i not in banned_groups) and (i not in test_groups)]

            train_idx = []
            test_idx = []
            for train_group in train_groups:
                train_idx += group_dict[train_group]
            for test_group in test_groups:
                test_idx += group_dict[test_group]
            yield train_idx, test_idx


In [46]:
cpgf = CombinatorialPurgedGroupKFold(10, 1)

val_scores = []
preds = np.zeros((xtest.shape[0], 10))


for (i, (train_idx, test_idx)) in enumerate(cpgf.split(xtrain, groups=pd.DataFrame(time_train)['time_id'])):
    print(f'fold: {i}')
    
    X_train, X_test = xtrain.loc[train_idx], xtrain.loc[test_idx]
    y_train, y_test = ytrain.loc[train_idx], ytrain.loc[test_idx]
    
    model = lgb.LGBMRegressor(**lgb_parameters)
    
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=100, verbose=False)
    model.predict(X_test)
    
    # validation score    
    score = pearsonr(model.predict(X_test), y_test)[0]
    val_scores.append(score)
    print(f"validation score: {score:.4f}")

    # actual test performance
    pred = model.predict(xtest)
    preds[:, i] = pred
    score = pearsonr(pred, ytest)[0]
    print(f"test score: {score:.4f}")


    del model, X_train, X_test, y_train, y_test; gc.collect()

fold: 0
validation score: 0.1147
test score: 0.1065
fold: 1
validation score: 0.1248
test score: 0.1130
fold: 2
validation score: 0.1364
test score: 0.1147
fold: 3
validation score: 0.1109
test score: 0.1103
fold: 4
validation score: 0.1370
test score: 0.1061
fold: 5
validation score: 0.1232
test score: 0.1115
fold: 6
validation score: 0.1171
test score: 0.1249
fold: 7
validation score: 0.1162
test score: 0.1109
fold: 8
validation score: 0.1295
test score: 0.1254
fold: 9
validation score: 0.1252
test score: 0.1080


In [47]:
statistics.mean(val_scores)

0.12350272206178159

In [48]:
pearsonr(preds.mean(axis=1), ytest)[0]

0.1347928785151134