# Imports

In [108]:
import pandas as pd
import numpy as np

#sklearn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit


# Utils
import random
import os
from tqdm import tqdm

# pretty table
from prettytable import ALL as ALL
from prettytable import PrettyTable

SEED = 42

In [2]:
def set_seed():
    random.seed(SEED)
    np.random.seed(SEED)
set_seed()

# Cascading Classifier

In [3]:
class CascadingTreeClassifier:
    def __init__(self, cascading_depth, threshold):
        self.depth = cascading_depth
        self.threshold = threshold
        self.trees = self.generate_trees(cascading_depth)
        self.tree_counter = {i+1: 0 for i in range(cascading_depth)}
        self.descison_counter = {i+1: 0 for i in range(cascading_depth)}

    @staticmethod
    def generate_trees(cascading_depth):
        trees = [DecisionTreeClassifier(max_depth=i, random_state=SEED+i) for i in range(1, cascading_depth + 1)]
        return trees

    def fit(self, X, y):
        _ = [t.fit(X, y) for t in self.trees]
        return self


    def predict_instance(self, x):
        for i, tree in enumerate(self.trees):
            confidence_arr = tree.predict_proba(x.reshape(1, - 1))
            self.tree_counter[i + 1] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[i+1] += 1
                return confidence_arr
        
        self.descison_counter[i+1] += 1
        return confidence_arr

    def predict_proba(self, X):
        predictions = np.apply_along_axis(func1d=self.predict_instance, axis=1, arr=X)
        predictions = np.squeeze(predictions, axis=1)
        return predictions

    def predict(self, X):
        proba_pred = self.predict_proba(X)
        classes_pred = np.argmax(proba_pred, axis=1)
        return classes_pred

    


In [4]:
def generate_batch_indices(indices_range, batch_num, op):
    batches = []
    idxs = set()
    
    batch_size = np.ceil(indices_range / (batch_num - (batch_num - 1) * op)).astype(int)

    for _ in range(batch_num):
        b = []
        b_size_left = batch_size

        if len(idxs) < batch_size:
            b = list(idxs)
            b_size_left -= len(idxs)
            idxs = set()

        if not idxs:
            idxs = set(range(indices_range))


        b += random.sample(idxs, b_size_left)
        batches.append(b)
        
        idxs = idxs - set(b)
    
    return batches

In [5]:
class AdvancedCascadingTreeClassifier(CascadingTreeClassifier):
    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage

    def fit(self, X, y):        
        batch_indices = generate_batch_indices(X.shape[0], self.depth, self.overlapping_percentage)
        
        for i, tree in enumerate(self.trees):
            tree.fit(X[batch_indices[i],:], y[batch_indices[i]])
            
        return self

In [6]:
class FeatureCascadingTreeClassifier(CascadingTreeClassifier):
    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage
        

    def fit(self, X, y):
        self.feature_indices = generate_batch_indices(X.shape[1], self.depth, self.overlapping_percentage)

        for i, tree in enumerate(self.trees):
            tree.fit(X[:,self.feature_indices[i]], y)
            
        return self
    
    def predict_instance(self, x):
        for i, tree in enumerate(self.trees):
            confidence_arr = tree.predict_proba(x[self.feature_indices[i]].reshape(1, - 1))
            self.tree_counter[i + 1] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[i+1] += 1
                return confidence_arr
        
        self.descison_counter[i+1] += 1
        return confidence_arr

In [7]:
class CombineCascadingTreeClassifier(CascadingTreeClassifier):
    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage
        

    def fit(self, X, y):
        self.feature_indices = generate_batch_indices(X.shape[1], self.depth, self.overlapping_percentage)
        batch_indices = generate_batch_indices(X.shape[0], self.depth, self.overlapping_percentage)

        for i, tree in enumerate(self.trees):
            tree.fit(X.take(batch_indices[i], axis=0).take(self.feature_indices[i], axis=1), y[batch_indices[i]])
            
        return self
    
    def predict_instance(self, x):
        for i, tree in enumerate(self.trees):
            confidence_arr = tree.predict_proba(x[self.feature_indices[i]].reshape(1, - 1))
            self.tree_counter[i + 1] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[i+1] += 1
                return confidence_arr
        
        self.descison_counter[i+1] += 1
        return confidence_arr

# Evaluate

In [68]:
def evaluate(extract_X_y):
    
    # prepare train/test
    X,y = extract_X_y()
    y =  LabelEncoder().fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
    
    
    eval_reuslts = {}
    # vanila cdt evaluate
    set_seed()
    cdt = CascadingTreeClassifier(15, 0.95)
    test_acc, test_loss, train_acc, train_loss = evaluate_by_cdt(cdt, X_train, X_test, y_train, y_test)
    eval_reuslts.update({'vanila_test_acc': test_acc, 'vanila_test_loss': test_loss, 'vanila_train_acc': train_acc, 'vanila_train_loss': train_loss})
    
    # cdt advanced, overlapping data
    set_seed()
    cdt_advanced = AdvancedCascadingTreeClassifier(15, 0.95, 0.9)
    test_acc, test_loss, train_acc, train_loss =  evaluate_by_cdt(cdt_advanced, X_train, X_test, y_train, y_test)
    eval_reuslts.update({'advanced_test_acc': test_acc, 'advanced_test_loss': test_loss, 'advanced_train_acc': train_acc, 'advanced_train_loss': train_loss})
    
    
    # cdt by features
    set_seed()
    cdt_features = FeatureCascadingTreeClassifier(15, 0.95, 0.95)
    test_acc, test_loss, train_acc, train_loss = evaluate_by_cdt(cdt_features, X_train, X_test, y_train, y_test)
    eval_reuslts.update({'features_test_acc': test_acc, 'features_test_loss': test_loss, 'features_train_acc': train_acc, 'features_train_loss': train_loss})
    
    #  cdt combined, combination of cdt_features + cdt_advanced
    set_seed()
    cdt_combine = CombineCascadingTreeClassifier(15, 0.95, 0.98)
    test_acc, test_loss, train_acc, train_loss = evaluate_by_cdt(cdt_combine, X_train, X_test, y_train, y_test)
    eval_reuslts.update({'combined_test_acc': test_acc, 'combined_test_loss': test_loss, 'combined_train_acc': train_acc, 'combined_train_loss': train_loss})
    
    return eval_reuslts
    
    
    
    

In [69]:
def evaluate_by_cdt(cdt, X_train, X_test, y_train, y_test):
    cdt.fit(X_train, y_train)
    train_probs, train_preds, test_probs, test_preds  = cdt.predict_proba(X_train), cdt.predict(X_train), cdt.predict_proba(X_test),  cdt.predict(X_test)
    
    test_acc, test_loss= accuracy_score(y_test, test_preds), log_loss(y_test, test_probs)
    train_acc, train_loss =  accuracy_score(y_train, train_preds), log_loss(y_train, train_probs)
    
    return test_acc, test_loss, train_acc, train_loss
    

## Datasets Functions

In [73]:
def fetal_health():
    if not os.path.exists(r'./datasets/fetal_health.csv'):
        !kaggle datasets download -d andrewmvd/fetal-health-classification  -f 'fetal_health.csv' -p './datasets/'
    df_fetal = pd.read_csv(r'./datasets/fetal_health.csv')
    X, y = df_fetal.iloc[:,:-1].values, df_fetal['fetal_health'].astype(int).values
    
    return X,y

In [71]:
def Frogs_MFCCs(): # 
    if not os.path.exists(r'./datasets/Frogs_MFCCs.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00406/Anuran%20Calls%20(MFCCs).zip' -P './datasets/'
        !unzip -q './datasets/Anuran Calls (MFCCs).zip' 'Frogs_MFCCs.csv' -d './datasets/' && rm './datasets/Anuran Calls (MFCCs).zip'

    df_frogs = pd.read_csv(r'./datasets/Frogs_MFCCs.csv')
    X = df_frogs.iloc[:, :22].values
    y = df_frogs['Genus'].values
    
    return X,y


In [99]:
def avila(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00459/
    if not os.path.exists(r'./datasets/avila-tr.txt'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip' -P './datasets/'
        !unzip -q './datasets/avila.zip' 'avila/avila-tr.txt' -d './datasets/' && rm './datasets/avila.zip'
        !mv './datasets/avila/avila-tr.txt' './datasets/avila-tr.txt' && rmdir './datasets/avila'

    col_names = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'Class']
    df_avila = pd.read_csv(r'./datasets/avila-tr.txt', delimiter = ",", names = col_names)
    X, y = df_avila.iloc[:,:-1].values, df_avila['Class'].values
    
    return X,y


In [119]:
def log2(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00542/
    
    if not os.path.exists(r'./datasets/log2.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00542/log2.csv' -P './datasets/'


    df_log2 = pd.read_csv(r'./datasets/log2.csv')
    cols_order = list(df_log2.columns)
    cols_order[-1], cols_order[cols_order.index('Action')] = cols_order[cols_order.index('Action')], cols_order[-1]
    df_log2 = df_log2[cols_order]
    X, y = df_log2.iloc[:,:-1].values, df_log2['Action'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [124]:
def wine_quality_red(): # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
    if not os.path.exists(r'./datasets/winequality-red.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' -P './datasets/'


    df_wine_quality_red = pd.read_csv(r'./datasets/winequality-red.csv', delimiter=';')
    X, y = df_wine_quality_red.iloc[:,:-1].values, df_wine_quality_red['quality'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [139]:
def wine_quality_white(): # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
    if not os.path.exists(r'./datasets/winequality-white.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv' -P './datasets/'


    df_wine_quality_white = pd.read_csv(r'./datasets/winequality-white.csv', delimiter=';')
    df_wine_quality_white = df_wine_quality_white[df_wine_quality_white['quality'] != 9]
    X, y = df_wine_quality_white.iloc[:,:-1].values, df_wine_quality_white['quality'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [157]:
def CTG(): # https://archive.ics.uci.edu/ml/datasets/cardiotocography
    if not os.path.exists(r'./datasets/CTG.xls'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00193/CTG.xls' -P './datasets/'


    df_ctg = pd.read_excel(r'./datasets/CTG.xlsx')
    X, y = df_ctg.iloc[:,:-1].values, df_ctg['CLASS'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y
    

In [162]:
def Dry_beans(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00602/
    if not os.path.exists(r'./datasets/Dry_Bean_Dataset.xlsx'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip' -P './datasets/'
        !unzip -q './datasets/DryBeanDataset.zip' 'DryBeanDataset/Dry_Bean_Dataset.xlsx' -d './datasets/' && rm './datasets/DryBeanDataset.zip'
        !mv './datasets/DryBeanDataset/Dry_Bean_Dataset.xlsx' './datasets/Dry_Bean_Dataset.xlsx' && rmdir './datasets/DryBeanDataset'

    df_dry_beans = pd.read_excel(r'./datasets/Dry_Bean_Dataset.xlsx')
    X, y = df_dry_beans.iloc[:,:-1].values, df_dry_beans['Class'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y
    

In [None]:

def Dry_beans(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00241/
    if not os.path.exists(r'./datasets/Dry_Bean_Dataset.xlsx'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip' -P './datasets/'
        !unzip -q './datasets/DryBeanDataset.zip' 'DryBeanDataset/Dry_Bean_Dataset.xlsx' -d './datasets/' && rm './datasets/DryBeanDataset.zip'
        !mv './datasets/DryBeanDataset/Dry_Bean_Dataset.xlsx' './datasets/Dry_Bean_Dataset.xlsx' && rmdir './datasets/DryBeanDataset'

    df_dry_beans = pd.read_excel(r'./datasets/Dry_Bean_Dataset.xlsx')
    X, y = df_dry_beans.iloc[:,:-1].values, df_dry_beans['Class'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [166]:

def Leaves_sha_64(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00241/
    if not os.path.exists(r'./datasets/data_Sha_64.txt'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00241/100%20leaves%20plant%20species.zip' -P './datasets/'
        !unzip -q './datasets/100 leaves plant species.zip' '100 leaves plant species/data_Sha_64.txt' -d './datasets/' && rm './datasets/100 leaves plant species.zip'
        !mv './datasets/100 leaves plant species/data_Sha_64.txt' './datasets/data_Sha_64.txt' && rmdir './datasets/100 leaves plant species'

    df_leaves_type = pd.read_csv(r'./datasets/data_Sha_64.txt', delimiter=',', header=None)
    cols_order = list(df_leaves_type.columns)
    cols_order[-1], cols_order[cols_order.index(0)] = cols_order[cols_order.index(0)], cols_order[-1]
    df_leaves_type = df_leaves_type[cols_order]
    X, y = df_leaves_type.iloc[:,:-1].values, df_leaves_type[0].values
    
    return X,y

In [167]:
Leaves_sha_64()

(array([[0.00061401, 0.00057884, 0.00060866, ..., 0.00062541, 0.0006241 ,
         0.00061671],
        [0.00066691, 0.00063028, 0.00066074, ..., 0.00064182, 0.00066119,
         0.00067058],
        [0.00060235, 0.00061634, 0.00061527, ..., 0.00054334, 0.00059248,
         0.00060658],
        ...,
        [0.00066291, 0.00070062, 0.00064198, ..., 0.00057423, 0.00060479,
         0.00062159],
        [0.00070234, 0.00070879, 0.00077631, ..., 0.00061398, 0.00065001,
         0.00068316],
        [0.0007965 , 0.00084963, 0.00076958, ..., 0.00076182, 0.00079923,
         0.00080984]]),
 array(['Acer Capillipes', 'Acer Capillipes', 'Acer Capillipes', ...,
        'Zelkova Serrata', 'Zelkova Serrata', 'Zelkova Serrata'],
       dtype=object))

# Compare Classifiers

In [169]:

datasets = [Leaves_sha_64]
result_table = PrettyTable(hrules=ALL)


result_table.field_names = ['Dataset',
                            'Vanila_Acc', 'Vanila_Loss',
                            'Advanced_Acc' , 'Advanced_loss',
                            'Features_Acc' , 'Features_loss',
                            'Combined_Acc' , 'Combined_loss']

for dataset in tqdm(datasets):
    result = evaluate(dataset)
    result_table.add_row([dataset.__name__, '', '', '', '', '', '', '', ''])
    
    for key in result.keys():
        if key.endswith('loss'):
            result[key] = f'{result[key]:.4f}'
        else:
            result[key] = f'{result[key] * 100:.2f}%'
            
    result_table.add_row(['train', result['vanila_train_acc'], result['vanila_train_loss'],
                                   result['advanced_train_acc'], result['advanced_train_loss'],
                                   result['features_train_acc'], result['features_train_loss'],
                                   result['combined_train_acc'], result['combined_train_loss']])
                          
    result_table.add_row(['test', result['vanila_test_acc'], result['vanila_test_loss'],
                                  result['advanced_test_acc'], result['advanced_test_loss'],
                                  result['features_test_acc'], result['features_test_loss'],
                                  result['combined_test_acc'], result['combined_test_loss']])
print(result_table)





  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A

ValueError: could not broadcast input array from shape (1,98) into shape (1,100)

# Dataset 9

In [158]:

df_leaves_type

Unnamed: 0,64,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,0
0,0.000614,0.000579,0.000609,0.000551,0.000554,0.000603,0.000614,0.000611,0.000611,0.000611,...,0.000507,0.000533,0.000555,0.000565,0.000581,0.000597,0.000625,0.000624,0.000617,Acer Capillipes
1,0.000667,0.000630,0.000661,0.000719,0.000651,0.000643,0.000640,0.000646,0.000624,0.000584,...,0.000503,0.000520,0.000533,0.000564,0.000596,0.000623,0.000642,0.000661,0.000671,Acer Capillipes
2,0.000602,0.000616,0.000615,0.000606,0.000568,0.000558,0.000552,0.000551,0.000552,0.000531,...,0.000581,0.000590,0.000589,0.000566,0.000575,0.000618,0.000543,0.000592,0.000607,Acer Capillipes
3,0.000603,0.000613,0.000569,0.000564,0.000607,0.000643,0.000647,0.000663,0.000658,0.000635,...,0.000516,0.000535,0.000549,0.000542,0.000566,0.000592,0.000601,0.000609,0.000614,Acer Capillipes
4,0.000594,0.000599,0.000552,0.000558,0.000569,0.000616,0.000639,0.000631,0.000634,0.000639,...,0.000529,0.000543,0.000557,0.000562,0.000558,0.000591,0.000608,0.000613,0.000610,Acer Capillipes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.000740,0.000740,0.000718,0.000674,0.000631,0.000579,0.000549,0.000505,0.000466,0.000449,...,0.000431,0.000481,0.000513,0.000569,0.000612,0.000663,0.000707,0.000735,0.000800,Zelkova Serrata
1596,0.000663,0.000672,0.000650,0.000643,0.000607,0.000572,0.000527,0.000501,0.000457,0.000440,...,0.000392,0.000420,0.000433,0.000469,0.000492,0.000532,0.000567,0.000605,0.000631,Zelkova Serrata
1597,0.000663,0.000701,0.000642,0.000646,0.000612,0.000569,0.000531,0.000492,0.000453,0.000427,...,0.000358,0.000396,0.000422,0.000461,0.000498,0.000538,0.000574,0.000605,0.000622,Zelkova Serrata
1598,0.000702,0.000709,0.000776,0.000716,0.000734,0.000708,0.000664,0.000618,0.000581,0.000543,...,0.000425,0.000459,0.000482,0.000500,0.000546,0.000578,0.000614,0.000650,0.000683,Zelkova Serrata


In [160]:
df_leaves_type = pd.read_csv(r'./datasets/data_Sha_64.txt', delimiter=',', header=None)
cols_order = list(df_leaves_type.columns)
cols_order[-1], cols_order[cols_order.index(0)] = cols_order[cols_order.index(0)], cols_order[-1]
df_leaves_type = df_leaves_type[cols_order]

X, y = df_leaves_type.iloc[:,:-1].values, df_leaves_type[0].values
y =  LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

0       Acer Capillipes
1       Acer Capillipes
2       Acer Capillipes
3       Acer Capillipes
4       Acer Capillipes
             ...       
1595    Zelkova Serrata
1596    Zelkova Serrata
1597    Zelkova Serrata
1598    Zelkova Serrata
1599    Zelkova Serrata
Name: 0, Length: 1600, dtype: object

# Dataset 10 - need to handle missing values and string values ...


In [168]:
df_nuclear = pd.read_csv(r'./datasets/Data_Cortex_Nuclear.csv')[1:]
del df_nuclear['MouseID']
df_nuclear

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
1,0.514617,0.689064,0.411770,2.789514,5.685038,0.211636,0.172817,2.292150,0.226972,1.596377,...,0.104315,0.441581,0.111974,0.135103,0.131119,1.743610,Control,Memantine,C/S,c-CS-m
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,Control,Memantine,C/S,c-CS-m
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,Control,Memantine,C/S,c-CS-m
4,0.434940,0.617430,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.504230,...,0.110694,0.434154,0.118481,0.140314,0.148380,1.839730,Control,Memantine,C/S,c-CS-m
5,0.447506,0.628176,0.367388,2.385939,4.807635,0.218578,0.176233,2.141282,0.195188,1.442398,...,0.109446,0.439833,0.116657,0.140766,0.142180,1.816389,Control,Memantine,C/S,c-CS-m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,0.254860,0.463591,0.254860,2.092082,2.600035,0.211736,0.171262,2.483740,0.207317,1.057971,...,0.183324,0.374088,0.318782,0.204660,0.328327,1.364823,Ts65Dn,Saline,S/C,t-SC-s
1076,0.272198,0.474163,0.251638,2.161390,2.801492,0.251274,0.182496,2.512737,0.216339,1.081150,...,0.175674,0.375259,0.325639,0.200415,0.293435,1.364478,Ts65Dn,Saline,S/C,t-SC-s
1077,0.228700,0.395179,0.234118,1.733184,2.220852,0.220665,0.161435,1.989723,0.185164,0.884342,...,0.158296,0.422121,0.321306,0.229193,0.355213,1.430825,Ts65Dn,Saline,S/C,t-SC-s
1078,0.221242,0.412894,0.243974,1.876347,2.384088,0.208897,0.173623,2.086028,0.192044,0.922595,...,0.196296,0.397676,0.335936,0.251317,0.365353,1.404031,Ts65Dn,Saline,S/C,t-SC-s


In [170]:
X, y = df_nuclear.iloc[:,:-1].values, df_nuclear['class'].values
y =  LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)