In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection

In [3]:
def create_folds(data, n_splits=5, random_state=None):
    """Assign a stratified cross-validation fold to every row of a regression frame.

    The continuous ``target`` column is discretised into equal-width bins
    (bin count chosen by Sturges' rule) and ``StratifiedKFold`` is run on
    those bins instead of on the raw targets, so every fold receives a
    similar spread of target values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric ``target`` column. The frame passed in
        is left untouched; all work happens on a shuffled copy.
    n_splits : int, default 5
        Number of folds to create.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample`` for the row shuffle. ``None``
        keeps the shuffle non-deterministic; pass an int for a
        reproducible fold assignment.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``data`` with a fresh ``0..len-1`` index and a
        ``kfold`` column holding the validation fold of each row. A column
        named ``bins`` is used as scratch space and is not present in the
        result.

    Raises
    ------
    KeyError
        If ``data`` has no ``target`` column.
    """
    if "target" not in data.columns:
        raise KeyError("create_folds expects a 'target' column in `data`")

    # shuffle first: sample() returns a new frame, so every column we add
    # below lands on the copy and never on the caller's DataFrame
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # -1 marks "not assigned yet"; every row is overwritten in the loop below
    shuffled["kfold"] = -1

    # number of bins by Sturges' rule: floor(1 + log2(n))
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # labels=False makes pd.cut return integer bin ids instead of intervals
    shuffled["bins"] = pd.cut(shuffled["target"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # stratify on the bins, not on the continuous targets; the positional
    # indices yielded by split() equal the labels because the index was reset
    for fold, (_, valid_idx) in enumerate(
        kf.split(X=shuffled, y=shuffled["bins"].values)
    ):
        shuffled.loc[valid_idx, "kfold"] = fold

    # the helper column is no longer needed once folds are assigned
    return shuffled.drop("bins", axis=1)

In [4]:
# Synthetic regression problem: 15000 samples, 100 features, one target.
# random_state is fixed so that a fresh Restart & Run All regenerates
# exactly the same data.
X, y = datasets.make_regression(
    n_samples=15000, n_features=100, n_targets=1, random_state=42
)

In [8]:
# Sanity check: the feature matrix should be (n_samples, n_features).
X.shape

(15000, 100)

In [9]:
# Sanity check: one target value per sample.
y.shape

(15000,)

In [10]:
# Inspect the continuous regression targets.
y

array([ 67.57168817,  44.42726441, 174.08971818, ...,  19.20917085,
        78.64979829,  -6.05956724])

In [12]:
# Inspect the raw feature matrix.
X

array([[-1.44642561,  0.47030435, -0.09003682, ..., -0.44975282,
        -0.38799317,  0.71443465],
       [ 0.75888724,  0.38339973, -0.75570859, ..., -0.29125749,
        -0.20038862,  0.13853431],
       [-0.12483014,  0.83169375,  1.30082059, ...,  1.15710447,
         1.06452664,  0.79880716],
       ...,
       [ 0.79838296, -2.35116457, -0.05766509, ..., -0.3585615 ,
         0.27910799,  0.91506944],
       [ 0.3007026 ,  0.15945163, -1.4955461 , ...,  0.23985146,
         0.16722646, -0.10338958],
       [ 0.27389177,  2.00356018,  0.4558802 , ...,  0.77750266,
        -1.23372357, -0.41975036]])

In [13]:
# Wrap the feature matrix in a DataFrame, labelling the columns
# f_0, f_1, ... with one label per column of X.
feature_names = [f"f_{i}" for i in range(X.shape[1])]
df = pd.DataFrame(data=X, columns=feature_names)

In [14]:
# Preview the feature frame.
df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_90,f_91,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99
0,-1.446426,0.470304,-0.090037,1.372448,0.175086,-0.259279,0.372518,-1.249224,-0.204630,1.003942,...,1.080715,0.150155,-0.513067,0.733901,0.212853,-0.006295,0.003343,-0.449753,-0.387993,0.714435
1,0.758887,0.383400,-0.755709,0.857785,-0.248809,-2.101839,-0.010296,1.199679,-0.795632,0.846643,...,0.042556,0.173743,-0.161217,-0.498572,1.225538,0.783384,0.669568,-0.291257,-0.200389,0.138534
2,-0.124830,0.831694,1.300821,0.218716,0.472179,1.105686,0.260671,-0.352529,-0.025069,-2.182070,...,-0.166515,-0.064395,-0.501992,0.323407,0.614846,0.825376,-0.548872,1.157104,1.064527,0.798807
3,-0.080759,-0.346017,1.523577,0.417540,-0.017775,-1.274081,0.420140,-0.778621,1.275734,0.228363,...,-0.229313,-0.144727,-0.858710,1.637962,-0.444260,0.063644,-1.298036,0.683254,0.185788,1.683973
4,-0.147865,0.145323,-1.834255,0.597837,0.287576,-0.958308,0.853622,0.261309,0.246742,-0.640867,...,2.390158,-0.948949,-0.499374,-0.154233,0.269111,0.069811,-0.227657,-0.935418,-1.460935,0.731837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2.459842,-0.093219,1.184535,-1.557862,-0.141881,0.469044,-3.094403,-0.258323,0.052064,-1.263803,...,-0.225499,-1.774788,0.144422,-0.306673,-0.294339,-0.173120,-0.654812,1.229948,-2.630918,0.569269
14996,1.327691,-0.698061,0.337938,0.075932,2.663795,-1.139527,1.046355,0.349043,-0.096646,0.871605,...,-0.958854,1.129971,0.990932,1.445285,-0.972852,0.339433,0.725879,-0.055061,0.369996,-0.740846
14997,0.798383,-2.351165,-0.057665,0.464167,-1.493883,-1.295085,0.355523,-0.661240,0.321882,-2.962879,...,1.911683,-1.186459,-0.676271,0.843900,0.114599,-0.853808,2.222003,-0.358561,0.279108,0.915069
14998,0.300703,0.159452,-1.495546,-0.516834,-0.228474,0.671990,-0.801756,0.235947,-1.885741,1.335320,...,-0.656783,0.126194,-1.544740,0.393907,-0.079574,0.020257,-0.096382,0.239851,0.167226,-0.103390


In [15]:
# Store the targets next to the features; create_folds reads them from
# the "target" column.
df.loc[:, "target"] = y
# Display the frame to confirm the new column is present.
df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_91,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target
0,-1.446426,0.470304,-0.090037,1.372448,0.175086,-0.259279,0.372518,-1.249224,-0.204630,1.003942,...,0.150155,-0.513067,0.733901,0.212853,-0.006295,0.003343,-0.449753,-0.387993,0.714435,67.571688
1,0.758887,0.383400,-0.755709,0.857785,-0.248809,-2.101839,-0.010296,1.199679,-0.795632,0.846643,...,0.173743,-0.161217,-0.498572,1.225538,0.783384,0.669568,-0.291257,-0.200389,0.138534,44.427264
2,-0.124830,0.831694,1.300821,0.218716,0.472179,1.105686,0.260671,-0.352529,-0.025069,-2.182070,...,-0.064395,-0.501992,0.323407,0.614846,0.825376,-0.548872,1.157104,1.064527,0.798807,174.089718
3,-0.080759,-0.346017,1.523577,0.417540,-0.017775,-1.274081,0.420140,-0.778621,1.275734,0.228363,...,-0.144727,-0.858710,1.637962,-0.444260,0.063644,-1.298036,0.683254,0.185788,1.683973,107.836263
4,-0.147865,0.145323,-1.834255,0.597837,0.287576,-0.958308,0.853622,0.261309,0.246742,-0.640867,...,-0.948949,-0.499374,-0.154233,0.269111,0.069811,-0.227657,-0.935418,-1.460935,0.731837,-49.232819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2.459842,-0.093219,1.184535,-1.557862,-0.141881,0.469044,-3.094403,-0.258323,0.052064,-1.263803,...,-1.774788,0.144422,-0.306673,-0.294339,-0.173120,-0.654812,1.229948,-2.630918,0.569269,59.427812
14996,1.327691,-0.698061,0.337938,0.075932,2.663795,-1.139527,1.046355,0.349043,-0.096646,0.871605,...,1.129971,0.990932,1.445285,-0.972852,0.339433,0.725879,-0.055061,0.369996,-0.740846,-47.190825
14997,0.798383,-2.351165,-0.057665,0.464167,-1.493883,-1.295085,0.355523,-0.661240,0.321882,-2.962879,...,-1.186459,-0.676271,0.843900,0.114599,-0.853808,2.222003,-0.358561,0.279108,0.915069,19.209171
14998,0.300703,0.159452,-1.495546,-0.516834,-0.228474,0.671990,-0.801756,0.235947,-1.885741,1.335320,...,0.126194,-1.544740,0.393907,-0.079574,0.020257,-0.096382,0.239851,0.167226,-0.103390,78.649798


In [16]:
# Shuffle the rows and add the "kfold" column; the returned frame replaces df.
df = create_folds(df)

In [17]:
# Inspect the shuffled frame together with its fold assignments.
df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target,kfold
0,-0.420820,-0.775485,-0.899912,-0.737377,0.230680,0.480220,0.246977,0.359241,-0.064467,-0.551475,...,-1.649873,-0.235341,-1.031907,0.538411,-0.254792,-1.381359,0.192093,1.115996,329.201549,0
1,0.955763,1.137238,0.140497,-0.630803,-0.586089,-1.734099,-1.126918,-0.513949,1.720259,0.307815,...,-2.549604,-0.889203,0.191451,0.520759,-0.995649,-1.486973,-2.341092,-1.330036,-465.869007,0
2,0.679056,0.493116,-0.786007,0.247171,1.446704,-0.941954,-0.072923,-0.235213,0.066479,-1.288300,...,-0.546470,1.679066,0.058205,-0.274601,1.365115,1.269930,-0.139062,-0.052081,-278.602492,0
3,1.216333,-0.605094,0.584830,-0.104748,1.302006,0.969088,1.302245,2.472836,2.287552,0.920830,...,-0.527964,-0.497061,-2.575184,0.552254,-1.704082,0.878910,0.434368,-1.022217,-120.881636,0
4,0.373347,-0.340923,-1.217958,0.419094,-2.892072,0.421158,-0.104374,0.986096,0.573438,-1.213808,...,0.812481,-0.372855,-1.451263,0.075695,-0.663683,-1.607947,0.306335,-2.301922,-249.178239,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,-1.601082,0.303070,-0.961472,-1.358658,-0.804425,-0.843487,-1.205186,-0.867022,0.994244,-0.334204,...,-0.038560,2.623558,-0.737655,2.064614,-0.670583,-1.085507,-1.361789,0.177814,40.220166,4
14996,-1.324112,0.628909,-1.026852,0.487514,-0.434833,0.417675,0.296053,-1.263383,2.452897,1.625668,...,0.337797,-0.836403,-1.702048,1.413179,2.021385,-0.375915,-0.563681,1.324238,323.044344,4
14997,0.391676,1.541947,0.635960,0.823184,0.508243,-1.882134,-1.882343,-0.968010,1.339621,0.445460,...,-0.938288,0.006150,-0.841728,-0.198435,-0.190046,0.170461,-0.990694,-0.817175,-366.521133,4
14998,0.370284,-1.950852,-0.632037,0.315585,0.368981,-0.307664,-0.878831,-0.367437,1.749265,0.594517,...,0.676255,0.229375,0.551776,-0.354083,-0.115323,-1.330349,0.929902,0.956446,-37.449918,4
