In [9]:
import numpy as np
import pandas as pd
from skmultilearn.model_selection import IterativeStratification

In [2]:
# Fix the global NumPy seed so every run draws the same synthetic sample
np.random.seed(42)

# Size of the synthetic dataset
num_rows = 1000

# Two categorical features, each sampled with deliberately skewed class
# weights so that the levels are not evenly represented
categories_1 = ['A', 'B', 'C']
categories_2 = ['D', 'E', 'F']
cat1 = np.random.choice(categories_1, size=num_rows, replace=True, p=[0.1, 0.3, 0.6])
cat2 = np.random.choice(categories_2, size=num_rows, replace=True, p=[0.6, 0.1, 0.3])

# Two Gaussian features, both with unit standard deviation: mean 0 and mean 100
num1 = np.random.normal(0, 1, num_rows)
num2 = np.random.normal(100, 1, num_rows)

# Assemble the four columns into a DataFrame
data = dict(cat1=cat1, cat2=cat2, num1=num1, num2=num2)
df = pd.DataFrame(data)

# Preview the first five rows
print(df.head())

  cat1 cat2      num1        num2
0    B    D -0.877983  101.870965
1    C    D -0.826880  100.389614
2    C    F -0.226479   99.131707
3    C    F  0.367366  100.534629
4    B    F  0.913585   97.364252


In [6]:
def iterative_split(df, test_size, stratify_columns):
    """Split ``df`` into train and test frames, stratified on several columns.

    Every stratify column is one-hot encoded and the indicator columns are
    placed side by side to form one multi-label matrix. That matrix is handed
    to ``IterativeStratification`` with two folds and
    ``order=len(stratify_columns)``, and the first split it yields is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are picked positionally (``iloc``), so the
        returned frames keep the original index labels.
    test_size : float
        Desired share of rows for the test set, strictly between 0 and 1.
        Forwarded as ``sample_distribution_per_fold=[test_size, 1 - test_size]``.
    stratify_columns : str or iterable
        Name(s) of the column(s) whose value proportions should be preserved.
        A single column name may be given as a plain string.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.

    Raises
    ------
    ValueError
        If ``test_size`` is not strictly between 0 and 1, or if
        ``stratify_columns`` is empty.
    KeyError
        If a requested stratify column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character and
    # its length would be used as the stratification order.
    if isinstance(stratify_columns, str):
        stratify_columns = [stratify_columns]
    else:
        # Materialise so that generators survive the len() call and the loop.
        stratify_columns = list(stratify_columns)

    if not stratify_columns:
        raise ValueError("stratify_columns must contain at least one column name")
    if not 0 < test_size < 1:
        raise ValueError(
            f"test_size must be strictly between 0 and 1, got {test_size!r}")

    missing = [col for col in stratify_columns if col not in df.columns]
    if missing:
        raise KeyError(f"stratify_columns not found in df: {missing}")

    # One-hot encode the stratify columns and concatenate them. This is the
    # label matrix whose proportions the stratifier tries to keep in each fold.
    one_hot_cols = [pd.get_dummies(df[col]) for col in stratify_columns]
    one_hot_cols = pd.concat(one_hot_cols, axis=1).to_numpy()

    stratifier = IterativeStratification(
        n_splits=2,
        order=len(stratify_columns),
        sample_distribution_per_fold=[test_size, 1 - test_size],
    )
    # Only the first (train, test) index pair produced by the splitter is used.
    train_indices, test_indices = next(stratifier.split(df.to_numpy(), one_hot_cols))

    # Return the train and test set dataframes
    train, test = df.iloc[train_indices], df.iloc[test_indices]
    return train, test

In [11]:
# `train` and `test` are not assigned by any earlier cell, so on a fresh kernel
# this cell would stop with a NameError. Create the split here so the notebook
# runs top to bottom.
# NOTE(review): test_size=0.4 and stratifying on both categorical columns are
# inferred from the saved outputs (train shares are multiples of 1/600, test
# shares are multiples of 1/400) — confirm against the original call.
train, test = iterative_split(df, test_size=0.4, stratify_columns=['cat1', 'cat2'])

# Compare the cat1 class shares of the full frame with those of each partition
print(f'{df[["cat1"]].value_counts(normalize = True) =}')
print(f'{train[["cat1"]].value_counts(normalize = True) =}')
print(f'{test[["cat1"]].value_counts(normalize = True)  =}')


df[["cat1"]].value_counts(normalize = True) =cat1
C       0.579
B       0.313
A       0.108
Name: proportion, dtype: float64
train[["cat1"]].value_counts(normalize = True) =cat1
C       0.578333
B       0.313333
A       0.108333
Name: proportion, dtype: float64
test[["cat1"]].value_counts(normalize = True)  =cat1
C       0.5800
B       0.3125
A       0.1075
Name: proportion, dtype: float64


In [10]:
# Compare the cat2 class shares of the full frame with those of each partition.
# Each label spells out the expression it belongs to, followed by the repr of
# the resulting Series.
print('df[["cat2"]].value_counts(normalize = True) =',
      repr(df[["cat2"]].value_counts(normalize=True)), sep='')
print('train[["cat2"]].value_counts(normalize = True) =',
      repr(train[["cat2"]].value_counts(normalize=True)), sep='')
print('test[["cat2"]].value_counts(normalize = True)  =',
      repr(test[["cat2"]].value_counts(normalize=True)), sep='')

df[["cat2"]].value_counts(normalize = True) =cat2
D       0.570
F       0.307
E       0.123
Name: proportion, dtype: float64
train[["cat2"]].value_counts(normalize = True) =cat2
D       0.570000
F       0.308333
E       0.121667
Name: proportion, dtype: float64
test[["cat2"]].value_counts(normalize = True)  =cat2
D       0.570
F       0.305
E       0.125
Name: proportion, dtype: float64
