<a href="https://www.kaggle.com/code/lonnieqin/ump-tf-record-combinatorialpurgedgroupkfold?scriptVersionId=115161173" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

#  UMP TF-Record: CombinatorialPurgedGroupKFold

In this notebook, I create TFRecord files for the UMP (Ubiquant Market Prediction) dataset, split into folds with the CombinatorialPurgedGroupKFold cross-validation strategy.

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
import json
import numpy as np
from scipy.special import comb
from itertools import combinations

class CombinatorialPurgedGroupKFold():
    """Combinatorial purged group K-fold cross-validator.

    The unique groups, taken in order of first appearance, are cut into
    ``n_splits`` contiguous blocks. Every combination of ``n_test_splits``
    blocks is used once as the test set, so ``comb(n_splits, n_test_splits)``
    folds are produced. Groups immediately before each test block (``purge``
    groups) and immediately after it (``purge`` groups plus the embargo) are
    excluded from the training set of that fold.

    Parameters
    ----------
    n_splits : int
        Number of contiguous blocks the ordered groups are cut into.
    n_test_splits : int
        Number of blocks that form the test set of each fold.
    purge : int
        Number of groups dropped from training on each side of a test block.
    pctEmbargo : float
        Fraction of the number of groups; ``int(n_groups * pctEmbargo)``
        additional groups are dropped from training after each test block.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo

    def split(self, X, y = None, groups = None):
        """Generate the train/test indices of every combinatorial fold.

        Parameters
        ----------
        X : sized object
            Only ``len(X)`` is used, as the number of samples.
        y : ignored
        groups : array-like
            Group label of every sample. Any label values are supported;
            block order follows the first appearance of each label.

        Yields
        ------
        train_idx, test_idx : list of int
            Positional sample indices of the fold.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are fewer groups than
            ``n_splits``, or if the embargo size is negative.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")

        # Convert once so that lookups below are positional even when
        # ``groups`` is a pandas Series with a non-default index.
        sample_groups = np.asarray(groups).tolist()

        u, ind = np.unique(np.asarray(groups), return_index = True)
        # np.unique sorts by value; restore first-appearance order.
        unique_groups = u[np.argsort(ind)].tolist()
        n_groups = len(unique_groups)

        group_dict = {}
        for idx in range(len(X)):
            group_dict.setdefault(sample_groups[idx], []).append(idx)

        # Each block needs at least one group. The number of folds
        # (combinations of blocks) may legitimately exceed the group count.
        if self.n_splits > n_groups:
            raise ValueError(
                ("Cannot have number of splits={0} greater than"
                 " the number of groups={1}").format(self.n_splits,
                                                     n_groups))

        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")

        # Half-open [start, stop) ranges of group *positions* per block; the
        # last block absorbs the remainder of the integer division.
        group_test_size = n_groups // self.n_splits
        split_bounds = []
        for split in range(self.n_splits):
            start = split * group_test_size
            if split == self.n_splits - 1:
                stop = n_groups
            else:
                stop = (split + 1) * group_test_size
            split_bounds.append((start, stop))

        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_positions = []
            banned_positions = set()
            for split in test_splits:
                start, stop = split_bounds[split]
                test_positions.extend(range(start, stop))
                # Purge before the block; clamp at 0 so that a large purge
                # cannot wrap around to the end of the group sequence.
                banned_positions.update(range(max(0, start - self.purge), start))
                # Purge plus embargo after the block, clamped at the end.
                banned_positions.update(
                    range(stop, min(n_groups, stop + self.purge + mbrg)))

            # A banned group that belongs to another test block stays in test.
            excluded = banned_positions.union(test_positions)

            train_idx = []
            test_idx = []
            for position, group in enumerate(unique_groups):
                if position not in excluded:
                    train_idx += group_dict[group]
            for position in test_positions:
                test_idx += group_dict[unique_groups[position]]
            yield train_idx, test_idx

Let's use a small sample dataset to understand this CV strategy.

In [None]:
# Toy data: 80 consecutive integers, bucketed into groups of 6 elements each.
n_splits, n_test_splits = 6, 2
elements = [*range(10 * (n_splits + n_test_splits))]
groups = [value // n_splits for value in elements]
data = pd.DataFrame({"group": groups, "element": elements})

# Print the sample indices that each fold assigns to train and to test.
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
for index, (train_indices, test_indices) in enumerate(kfold.split(data, groups=data["group"])):
    print("=" * 100, f"Fold {index}", "=" * 100, sep="\n")
    print("Train indices:", train_indices, "Length:", len(train_indices))
    print("Test Indices:", test_indices, "Length:", len(test_indices))

## Import dataset

In [None]:
%%time
# Number of feature columns and their names: f_0 ... f_299.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# Load the training table from a pickle file (the dataset name suggests
# half-precision values — TODO confirm dtypes).
# NOTE: read_pickle deserializes arbitrary pickled objects; only load files
# from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Last expression of the cell, so the notebook displays the first rows.
train.head()

In [None]:
# Move the "investment_id" column out of `train` into its own Series;
# pop() removes the column from the DataFrame in place.
investment_id = train.pop("investment_id")
investment_id.head()

In [None]:
# Move the "time_id" column out of `train` (removed in place); it is used
# later as the group label for the cross-validation split.
time_id = train.pop("time_id")

In [None]:
# Move the "target" column out of `train` (removed in place). After the three
# pops, `train` holds only the remaining columns, which create_record writes
# as the "features" vector.
y = train.pop("target")
y.head()

## Create TF-Record

In [None]:
def create_record(i):
    """Serialize sample ``i`` as a ``tf.train.Example`` byte string.

    Reads row ``i`` (positional) from the notebook-level ``train``,
    ``time_id``, ``investment_id`` and ``y`` objects.

    The example holds four entries: "features" (float list with every value
    of the ``train`` row), "time_id" and "investment_id" (single int64 each)
    and "target" (single float).
    """
    feature_values = tf.train.FloatList(value=list(train.iloc[i]))
    time_value = tf.train.Int64List(value=[time_id.iloc[i]])
    investment_value = tf.train.Int64List(value=[investment_id.iloc[i]])
    target_value = tf.train.FloatList(value=[y.iloc[i]])

    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                "features": tf.train.Feature(float_list=feature_values),
                "time_id": tf.train.Feature(int64_list=time_value),
                "investment_id": tf.train.Feature(int64_list=investment_value),
                "target": tf.train.Feature(float_list=target_value),
            }
        )
    )
    return example.SerializeToString()
    
def decode_function(record_bytes):
    """Parse one serialized example back into its named fields.

    The schema mirrors what ``create_record`` writes: a fixed-length vector of
    300 float32 "features", scalar int64 "time_id" and "investment_id", and a
    scalar float32 "target".
    """
    schema = {
        "features": tf.io.FixedLenFeature([300], dtype=tf.float32),
        "time_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "investment_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "target": tf.io.FixedLenFeature([], dtype=tf.float32),
    }
    return tf.io.parse_single_example(record_bytes, schema)

Now create the whole dataset; this will take a long time.

In [None]:
%%time
import time

# With 5 blocks and 1 test block per fold this yields 5 folds, grouped by
# time_id so that all rows of one time step land on the same side of a split.
n_splits = 5
n_test_splits = 1
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
for fold, (train_indices, test_indices) in enumerate(kfold.split(train, groups=time_id)):
    print("=" * 100)
    print(f"Fold {fold}")
    print("=" * 100)
    print("Train Sample size:", len(train_indices))
    print("Test Sample size:", len(test_indices))
    train_save_path = f"fold_{fold}_train.tfrecords"
    test_save_path = f"fold_{fold}_test.tfrecords"
    # Write the train file, then the test file, one serialized example per
    # sample. Each progress message names the file that is actually being
    # written.
    for save_path, indices in ((train_save_path, train_indices),
                               (test_save_path, test_indices)):
        begin = time.time()
        print(f"Creating {save_path}")
        with tf.io.TFRecordWriter(save_path) as file_writer:
            for i in indices:
                file_writer.write(create_record(i))
        print("Elapsed time: %.2f"%(time.time() - begin))

## Write unique Investment Ids

In [None]:
# Distinct investment ids, in order of first appearance.
investment_ids = investment_id.unique()
# Save them as a single-column CSV without the DataFrame index.
investment_id_df = pd.DataFrame({"investment_id": investment_ids})
investment_id_df.to_csv("investment_ids.csv", index=False)