In [1]:
import itertools
import os
import random
import warnings
from dataclasses import dataclass

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, train_test_split

In [38]:
class AdvancedKfold:
    """Three-way K-fold splitter built on independent random draws.

    Every fold is a ``(train, eval, valid)`` triple of index-label lists.
    For the first ``n_splits - 1`` folds the evaluation labels are sampled
    from labels not yet used for evaluation, and the validation labels from
    labels that are neither in this fold's evaluation set nor used for
    validation before. The last fold takes whatever is left over.

    Known limitation: the last fold's validation set is "labels not yet
    validated, minus this fold's evaluation labels", so it can be smaller
    than the other folds (even empty) and full validation coverage is not
    guaranteed.

    NOTE(review): a later cell of this notebook defines another class with
    the same name; once that cell has run it replaces this definition.

    NOTE(review): sample populations are built from set differences, whose
    iteration order is an implementation detail; for non-integer labels the
    drawn folds may differ between interpreter sessions.

    Args:
        n_splits: number of folds; must be at least 3 so that the training
            part is not empty by construction.
        random_state: seed used for the fold draws.

    Raises:
        ValueError: if ``n_splits`` is smaller than 3.
    """

    def __init__(self, n_splits, random_state):
        if n_splits < 3:
            raise ValueError(
                f"n_splits must be at least 3 for a train/eval/valid split, got {n_splits}"
            )
        self.n_splits = n_splits
        self.random_state = random_state
        # Kept for backward compatibility: this constructor has always seeded
        # the global RNG and other cells may rely on that side effect.
        random.seed(random_state)
        # Never written by split(); kept so existing attribute access works.
        self.past_train_indicies = []
        self.past_eval_indicies = []
        self.past_valid_indicies = []

    def split(self, X: pd.DataFrame):
        """Yield ``(train, eval, valid)`` label lists for each fold.

        The bookkeeping lists and the random generator are re-initialised on
        every call, so the splitter can be iterated more than once and gives
        the same folds each time for a given ``random_state``.

        Args:
            X: frame whose index labels are split.

        Yields:
            Tuple of three lists of index labels: train, eval, valid.

        Raises:
            ValueError: if ``X`` has fewer rows than ``n_splits``.
        """
        indicies = list(X.index)
        if self.n_splits > len(indicies):
            raise ValueError(
                f"Cannot split {len(indicies)} rows into {self.n_splits} folds"
            )

        # A private generator keeps the draws independent of any other use of
        # the global `random` module between construction and this call.
        rng = random.Random(self.random_state)
        # Reset per-call state; leftovers from a previous pass would otherwise
        # shrink the sampling populations below `n_sample`.
        self.past_train_indicies = []
        self.past_eval_indicies = []
        self.past_valid_indicies = []

        all_labels = set(indicies)
        n_sample = len(indicies) // self.n_splits
        for _ in range(self.n_splits - 1):
            # Draw this fold's evaluation and validation labels.
            eval_indicies = rng.sample(
                list(all_labels - set(self.past_eval_indicies)), n_sample
            )
            valid_indicies = rng.sample(
                list(all_labels - set(eval_indicies) - set(self.past_valid_indicies)),
                n_sample,
            )
            train_indicies = list(all_labels - set(eval_indicies) - set(valid_indicies))
            # Remember what has been used so later folds do not repeat it.
            self.past_eval_indicies += eval_indicies
            self.past_valid_indicies += valid_indicies
            yield train_indicies, eval_indicies, valid_indicies

        # Last fold: everything not evaluated / validated so far.
        eval_indicies = list(all_labels - set(self.past_eval_indicies))
        valid_indicies = list(all_labels - set(eval_indicies) - set(self.past_valid_indicies))
        train_indicies = list(all_labels - set(eval_indicies) - set(valid_indicies))
        yield train_indicies, eval_indicies, valid_indicies

In [62]:
class AdvancedKfold:
    """Three-way K-fold splitter: every fold yields train / eval / valid labels.

    The index labels are shuffled once and cut into ``n_splits`` contiguous
    folds. Fold ``i`` is used for evaluation, the following fold (wrapping
    around to the first one) for validation, and all remaining labels for
    training. Each label therefore appears in exactly one evaluation fold
    and exactly one validation fold.

    When the number of rows is not a multiple of ``n_splits`` the first
    ``len(X) % n_splits`` folds receive one extra label, so no row is left
    out of evaluation or validation.

    Args:
        n_splits: number of folds; must be at least 3, otherwise evaluation
            and validation would overlap or leave no training data.
        random_state: seed for the shuffle.

    Raises:
        ValueError: if ``n_splits`` is smaller than 3.
    """

    def __init__(self, n_splits, random_state):
        if n_splits < 3:
            raise ValueError(
                f"n_splits must be at least 3 for a train/eval/valid split, got {n_splits}"
            )
        self.n_splits = n_splits
        self.random_state = random_state
        # Kept for backward compatibility: this constructor has always seeded
        # the global RNG and other cells may rely on that side effect.
        random.seed(random_state)

    def split(self, X: pd.DataFrame):
        """Yield ``(train, eval, valid)`` label lists for each fold.

        Repeated calls return the same partition for a given
        ``random_state`` because a fresh, privately seeded generator is used
        on every call.

        Args:
            X: frame whose index labels are split.

        Yields:
            Tuple of three lists of index labels: train (in the order of
            ``X.index``), eval and valid (both in shuffled order).

        Raises:
            ValueError: if ``X`` has fewer rows than ``n_splits``.
        """
        indicies = list(X.index)
        n_total = len(indicies)
        if self.n_splits > n_total:
            raise ValueError(f"Cannot split {n_total} rows into {self.n_splits} folds")

        # Private generator: the shuffle no longer depends on other users of
        # the global `random` module between construction and this call.
        rng = random.Random(self.random_state)
        shuffled = rng.sample(indicies, n_total)

        # Cut the shuffled labels into contiguous folds; the first
        # `n_larger` folds absorb the remainder, one label each.
        base_size, n_larger = divmod(n_total, self.n_splits)
        folds = []
        start = 0
        for fold_no in range(self.n_splits):
            stop = start + base_size + (1 if fold_no < n_larger else 0)
            folds.append(shuffled[start:stop])
            start = stop

        for fold_no in range(self.n_splits):
            # Copies, so a caller mutating one fold cannot affect another.
            eval_indicies = list(folds[fold_no])
            # Validation is the next fold, wrapping around after the last.
            valid_indicies = list(folds[(fold_no + 1) % self.n_splits])
            held_out = set(eval_indicies) | set(valid_indicies)
            # Filter in index order instead of relying on set iteration order.
            train_indicies = [label for label in indicies if label not in held_out]
            yield train_indicies, eval_indicies, valid_indicies

In [63]:
# Splitter under test: five folds, seeded with 42.
advanced_kfold = AdvancedKfold(
    n_splits=5,
    random_state=42,
)

In [64]:
# Sanity check on a 100-row dummy frame: gather every evaluation fold into
# b_list and every validation fold into c_list, and print an alert whenever
# two of the three label sets of a fold share a label.
b_list = []
c_list = []
cnt = 0
for train_idx, eval_idx, valid_idx in advanced_kfold.split(pd.DataFrame(index=np.arange(0, 100, 1))):
    b_list.extend(eval_idx)
    c_list.extend(valid_idx)
    print(cnt)
    train_set, eval_set, valid_set = set(train_idx), set(eval_idx), set(valid_idx)
    if train_set & eval_set:
        print("a & b Alert")
    if eval_set & valid_set:
        print("b & c Alert")
    if valid_set & train_set:
        print("c & a Alert")
    cnt += 1

0
1
2
3
4


In [65]:
# Evaluation coverage: distinct labels on the first line, total collected on
# the second; equal numbers mean no label was evaluated twice.
print(len(set(b_list)), len(b_list), sep="\n")

100
100


In [66]:
# Validation coverage: distinct labels on the first line, total collected on
# the second; equal numbers mean no label was validated twice.
print(len(set(c_list)), len(c_list), sep="\n")

100
100
