In [37]:
import sys
import os
sys.path.insert(0, os.path.abspath(".."))

import ast
from slim_gsgp.datasets.data_loader import load_pandas_df
import pandas as pd
import numpy as np
from slim_gsgp.main_gp import gp
from slim_gsgp.main_slim import slim
from slim_gsgp.main_gsgp import gsgp
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from imblearn.over_sampling import SMOTENC, SMOTE

In [None]:
info = pd.read_csv("info.csv")

# The split-index columns come back from the CSV as text; evaluate each string
# cell into the Python literal it spells out and leave non-string cells as-is.
info = info.assign(
    test_indices=info['test_indices'].apply(
        lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
    ),
    train_indices=info['train_indices'].apply(
        lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
    ),
)

In [39]:
# Display the dataset metadata table held in `info`.
info

Unnamed: 0,name,n_samples,n_features,imbalance,categoricals,n_categoricals,train_indices,test_indices
0,blood,748,4,0.237968,[],0,"[[255, 174, 420, 341, 186, 512, 635, 190, 450,...","[[343, 644, 607, 22, 82, 105, 681, 292, 451, 5..."
1,clima,540,18,0.085185,[],0,"[[409, 132, 372, 52, 215, 156, 410, 76, 128, 2...","[[495, 424, 200, 281, 221, 500, 81, 515, 432, ..."
2,eeg,14980,14,0.448798,[],0,"[[4155, 5750, 12565, 7790, 12551, 963, 14035, ...","[[5237, 10277, 9601, 1187, 5396, 7470, 10820, ..."
3,fertility,100,9,0.12,"['season', 'child_diseases', 'accident', 'surg...",7,"[[61, 5, 1, 70, 24, 40, 63, 52, 91, 99, 32, 47...","[[74, 46, 2, 33, 82, 25, 26, 11, 90, 15, 71, 8..."
4,gina,3153,970,0.491595,[],0,"[[2470, 503, 112, 1455, 1167, 1112, 2227, 1308...","[[49, 918, 536, 2682, 1079, 2440, 1168, 2148, ..."
5,hill,1212,100,0.5,[],0,"[[113, 547, 693, 906, 753, 537, 979, 803, 169,...","[[331, 231, 1044, 85, 882, 835, 155, 570, 253,..."
6,ilpd,583,10,0.286449,[],0,"[[213, 76, 343, 434, 44, 226, 532, 70, 493, 56...","[[208, 273, 442, 479, 478, 319, 224, 522, 474,..."
7,kc,2109,21,0.154576,[],0,"[[571, 2078, 248, 1348, 549, 603, 1085, 466, 1...","[[81, 968, 757, 193, 823, 1191, 1079, 1858, 17..."
8,liver,345,6,0.42029,[],0,"[[194, 281, 228, 140, 78, 40, 302, 285, 43, 15...","[[130, 52, 238, 125, 249, 204, 257, 307, 4, 79..."
9,musk,476,166,0.434874,[],0,"[[246, 107, 32, 180, 92, 46, 319, 132, 396, 14...","[[442, 423, 363, 43, 237, 407, 283, 271, 244, ..."


In [None]:
def oversample(df, categoricals=None):
    """Balance the classes of ``df`` with SMOTE (or SMOTENC for categorical data).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to resample. The last column is treated as the target and every
        other column as a feature.
        NOTE(review): "target is the last column" is assumed to match the
        convention of ``load_pandas_df`` -- confirm.
    categoricals : list, optional
        Categorical feature columns, given either as column names or as
        positional indices. When empty or ``None`` plain SMOTE is used,
        otherwise SMOTENC.

    Returns
    -------
    pandas.DataFrame
        The resampled features with the resampled target appended as the last
        column, i.e. the same column layout as ``df``.
    """
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    if categoricals:
        # SMOTENC is the variant that needs to know which features are
        # categorical. Names are translated to positions so the call does not
        # depend on name-based support in the installed imbalanced-learn.
        categorical_positions = [
            features.columns.get_loc(col) if isinstance(col, str) else col
            for col in categoricals
        ]
        sampler = SMOTENC(categorical_features=categorical_positions, random_state=42)
    else:
        # Plain SMOTE has no `categorical_features` argument.
        sampler = SMOTE(random_state=42)

    # fit_resample needs the features and the target as separate arguments.
    features_resampled, target_resampled = sampler.fit_resample(features, target)

    return pd.concat([features_resampled, target_resampled], axis=1)

In [None]:
def return_train_test(df, train_indices, test_indices, oversampling=False, categoricals=None):
    """Split ``df`` into train/test parts and convert each with ``load_pandas_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; rows are selected by position.
    train_indices, test_indices : list of int
        Positional row indices of the training and test partitions.
    oversampling : bool, default False
        When true, the training partition is passed through ``oversample``
        before conversion.
    categoricals : list, optional
        Categorical feature columns forwarded to ``oversample``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as produced by
        ``load_pandas_df(..., X_y=True)``.
    """
    train = df.iloc[train_indices]
    test = df.iloc[test_indices]

    if oversampling:
        # Resample the training partition only: resampling the whole frame
        # would put test rows (and synthetic points derived from them) into
        # the training data.
        train = oversample(train, categoricals or [])

    X_train, y_train = load_pandas_df(train, X_y=True)
    X_test, y_test = load_pandas_df(test, X_y=True)

    return X_train, y_train, X_test, y_test
    

In [None]:
def monte_carlo_cv(df, data_name, info, n_runs, oversampling=False, categoricals=None):
    """Build the train/test split of every Monte Carlo run for one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, forwarded to ``return_train_test``.
    data_name : str
        Value of the ``name`` column in ``info`` identifying the dataset.
    info : pandas.DataFrame
        Metadata table with ``name``, ``train_indices`` and ``test_indices``
        columns; each index cell holds one list of row indices per run.
    n_runs : int
        Number of runs to iterate over (the first ``n_runs`` stored splits).
    oversampling : bool, default False
        Forwarded to ``return_train_test``.
    categoricals : list, optional
        Forwarded to ``return_train_test``.

    Raises
    ------
    KeyError
        If ``data_name`` does not appear in ``info['name']``.
    """
    categoricals = [] if categoricals is None else categoricals

    dataset_rows = info.loc[info['name'] == data_name]
    if dataset_rows.empty:
        raise KeyError(data_name)

    # Take the first matching row by position. A plain `[0]` is a label lookup
    # and only works for the dataset that sits at index label 0.
    train_splits = dataset_rows['train_indices'].iloc[0]
    test_splits = dataset_rows['test_indices'].iloc[0]

    for run in range(n_runs):
        # NOTE(review): the split is built but not used or returned yet.
        X_train, y_train, X_test, y_test = return_train_test(
            df, train_splits[run], test_splits[run], oversampling, categoricals
        )
        
        

In [24]:
# Replace any cell that is a one-element list with its sole element; every
# other cell is kept unchanged. Then show the first row's value.
info['test_indices'] = info['test_indices'].apply(
    lambda cell: cell if not (isinstance(cell, list) and len(cell) == 1) else cell[0]
)
info['test_indices'][0]

'[[343, 644, 607, 22, 82, 105, 681, 292, 451, 594, 554, 634, 570, 272, 391, 568, 411, 498, 402, 362, 394, 166, 436, 646, 265, 515, 369, 706, 495, 572, 210, 742, 712, 156, 70, 447, 373, 107, 238, 7, 351, 464, 262, 431, 448, 140, 485, 434, 322, 511, 52, 115, 125, 649, 390, 687, 721, 136, 501, 209, 689, 202, 641, 65, 546, 536, 537, 169, 111, 543, 500, 243, 295, 647, 628, 257, 193, 740, 109, 522, 676, 731, 539, 168, 121, 91, 223, 527, 228, 237, 289, 206, 229, 393, 12, 489, 226, 317, 636, 381, 670, 547, 217, 339, 304, 33, 623, 234, 668, 293, 469, 691, 442, 138, 722, 376, 288, 276, 637, 155, 270, 477, 338, 35, 312, 72, 135, 531, 620, 62, 56, 517, 275, 361, 191, 470, 581, 682, 504, 726, 661, 281, 285, 621, 302, 632, 558, 360, 378, 626, 453, 388, 664, 277, 29, 578, 573, 24, 471, 586, 701, 301, 617, 310, 710, 141, 76, 452, 309, 499, 430, 139, 416, 487, 84, 462, 20, 328, 235, 474, 698, 164, 55, 244, 153, 516, 40, 368, 316, 413, 299, 540, 297, 252, 455, 533, 189, 510, 214, 629, 427, 46, 638, 315,

In [None]:
def train_model(model_name, df, data):
    """Placeholder for model training; not implemented yet.

    The cell originally held only an unfinished signature (no colon, no body),
    which is a SyntaxError. It is now a valid stub that fails loudly when
    called.

    NOTE(review): presumably ``model_name`` selects one of the imported
    ``gp`` / ``gsgp`` / ``slim`` entry points and ``df`` is the dataset; the
    meaning of ``data`` is unclear -- confirm before implementing.
    """
    raise NotImplementedError("train_model is not implemented yet")

[255,
 174,
 420,
 341,
 186,
 512,
 635,
 190,
 450,
 491,
 124,
 9,
 692,
 92,
 377,
 323,
 566,
 456,
 331,
 556,
 98,
 31,
 610,
 231,
 66,
 201,
 736,
 475,
 657,
 410,
 146,
 611,
 513,
 458,
 131,
 669,
 313,
 709,
 506,
 335,
 423,
 207,
 606,
 151,
 253,
 114,
 580,
 311,
 354,
 545,
 15,
 34,
 108,
 449,
 284,
 345,
 142,
 28,
 233,
 704,
 476,
 163,
 370,
 359,
 700,
 227,
 355,
 645,
 350,
 686,
 490,
 564,
 703,
 590,
 357,
 651,
 267,
 147,
 639,
 642,
 104,
 562,
 367,
 137,
 665,
 678,
 30,
 216,
 134,
 524,
 96,
 472,
 520,
 426,
 194,
 625,
 291,
 663,
 433,
 47,
 129,
 425,
 372,
 671,
 298,
 159,
 259,
 479,
 184,
 263,
 557,
 741,
 21,
 171,
 467,
 702,
 59,
 48,
 94,
 148,
 614,
 660,
 509,
 408,
 6,
 461,
 383,
 648,
 747,
 333,
 182,
 78,
 283,
 593,
 672,
 694,
 631,
 719,
 51,
 356,
 693,
 380,
 565,
 555,
 213,
 605,
 401,
 609,
 280,
 320,
 603,
 116,
 45,
 727,
 724,
 405,
 87,
 465,
 559,
 247,
 307,
 463,
 366,
 395,
 224,
 596,
 622,
 435,
 73,
 19,
 488

In [None]:
# NOTE(review): `ast` is already imported in the notebook's first cell, so this
# repeated import is redundant.
import ast

# Display the dataset metadata table held in `info`.
info

Unnamed: 0,name,n_samples,n_features,imbalance,categoricals,n_categoricals,train_indices,test_indices
0,blood,748,4,0.237968,[],0,"[[255, 174, 420, 341, 186, 512, 635, 190, 450,...","[[343, 644, 607, 22, 82, 105, 681, 292, 451, 5..."
1,clima,540,18,0.085185,[],0,"[[409, 132, 372, 52, 215, 156, 410, 76, 128, 2...","[[495, 424, 200, 281, 221, 500, 81, 515, 432, ..."
2,eeg,14980,14,0.448798,[],0,"[[4155, 5750, 12565, 7790, 12551, 963, 14035, ...","[[5237, 10277, 9601, 1187, 5396, 7470, 10820, ..."
3,fertility,100,9,0.12,"['season', 'child_diseases', 'accident', 'surg...",7,"[[61, 5, 1, 70, 24, 40, 63, 52, 91, 99, 32, 47...","[[74, 46, 2, 33, 82, 25, 26, 11, 90, 15, 71, 8..."
4,gina,3153,970,0.491595,[],0,"[[2470, 503, 112, 1455, 1167, 1112, 2227, 1308...","[[49, 918, 536, 2682, 1079, 2440, 1168, 2148, ..."
5,hill,1212,100,0.5,[],0,"[[113, 547, 693, 906, 753, 537, 979, 803, 169,...","[[331, 231, 1044, 85, 882, 835, 155, 570, 253,..."
6,ilpd,583,10,0.286449,[],0,"[[213, 76, 343, 434, 44, 226, 532, 70, 493, 56...","[[208, 273, 442, 479, 478, 319, 224, 522, 474,..."
7,kc,2109,21,0.154576,[],0,"[[571, 2078, 248, 1348, 549, 603, 1085, 466, 1...","[[81, 968, 757, 193, 823, 1191, 1079, 1858, 17..."
8,liver,345,6,0.42029,[],0,"[[194, 281, 228, 140, 78, 40, 302, 285, 43, 15...","[[130, 52, 238, 125, 249, 204, 257, 307, 4, 79..."
9,musk,476,166,0.434874,[],0,"[[246, 107, 32, 180, 92, 46, 319, 132, 396, 14...","[[442, 423, 363, 43, 237, 407, 283, 271, 244, ..."


[495,
 424,
 200,
 281,
 221,
 500,
 81,
 515,
 432,
 488,
 483,
 245,
 54,
 220,
 62,
 4,
 427,
 306,
 117,
 366,
 38,
 213,
 158,
 293,
 349,
 70,
 174,
 32,
 474,
 492,
 477,
 172,
 228,
 328,
 536,
 187,
 289,
 292,
 294,
 17,
 129,
 371,
 123,
 333,
 74,
 84,
 368,
 167,
 364,
 118,
 264,
 260,
 531,
 145,
 262,
 8,
 148,
 251,
 286,
 22,
 170,
 501,
 433,
 447,
 514,
 451,
 171,
 280,
 282,
 436,
 533,
 241,
 206,
 473,
 452,
 3,
 407,
 144,
 240,
 107,
 386,
 309,
 136,
 175,
 332,
 11,
 2,
 413,
 161,
 443,
 471,
 157,
 151,
 83,
 56,
 351,
 24,
 405,
 98,
 340,
 417,
 494,
 428,
 312,
 370,
 265,
 141,
 86,
 105,
 234,
 166,
 290,
 345,
 227,
 394,
 268,
 279,
 319,
 532,
 392,
 39,
 79,
 523,
 46,
 348,
 311,
 97,
 440,
 78,
 133,
 510,
 387,
 402,
 291,
 430,
 516,
 27,
 421,
 53,
 382,
 163,
 160,
 353,
 266,
 168,
 225,
 85,
 391,
 93,
 28,
 479,
 217,
 434,
 489,
 66,
 235,
 63,
 82,
 310,
 520,
 298,
 369]

In [None]:
def 