# Feature selection and data splitting for MDS data

In [26]:
%load_ext autoreload
%autoreload 2

import numpy as np
import polars as pl
from sklearn.model_selection import StratifiedKFold

from src.data_preprocessing import OmicDataSplitter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the sample annotations and keep only what the splitters need:
# the sample identifier (renamed to "sample_ids") and the disease label,
# shifted down by one and stored as "class".
y = pl.read_csv("mds_data/preprocessed/annotations_66.csv").select(
    pl.col("SAMPLE_ID").alias("sample_ids"),
    (pl.col("disease") - 1).alias("class"),
)
y

sample_ids,class
str,i64
"""N58""",0
"""N60""",0
"""N70""",0
"""N82""",0
"""N83""",0
…,…
"""V806""",1
"""V839""",1
"""V883""",1
"""V888""",1


In [3]:
# mRNA: read the preprocessed table, then configure the splitter and run it.
mrna = pl.read_csv("mds_data/preprocessed/mrna.csv")

OmicDataSplitter(
    # data: omic table, its non-sample columns, and the class labels
    df=mrna,
    annotation_cols=["GENE_ID", "GENE_NAME"],
    y_df=y,
    # split / selection settings
    n_splits=5,
    n_features=200,
    random_state=3,
    # directory handed to the splitter for its results
    output_dir="mds_data/splits/mrna",
).process_data()

GENE_ID,GENE_NAME,N58,N60,N70,N82,N83,N84,N85,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ENSG00000103184""","""SEC14L5""",4.574155,7.288239,3.561246,2.897224,3.615696,4.057362,2.203009,4.398603,3.467058,3.657675,4.619424,1.507977,4.809069,4.749144,2.146183,2.66225,3.555635,0.714897,2.010324,6.268556,5.914118,4.018251,6.781959,4.900504,3.481129,3.767197,5.167452,1.631362,5.933825,4.945851,0.444571,2.734007,4.452827,5.877369,6.07671,3.343197,6.156233,2.993291,5.727105,1.36271,5.001145,5.516594,6.915722,3.82929,3.319966,6.938185,5.118215,1.629126,5.885573,5.003287,2.039422,10.53412,2.297346,4.097657,3.891976,4.604959,5.844351,3.188559,1.986391,2.357737,3.549326,5.252131,3.976603,5.670529,4.456624,4.897539
"""ENSG00000119862""","""LGALSL""",7.455039,8.479401,6.306391,3.321911,6.188541,5.979059,5.223318,6.5576,6.140982,7.953992,6.599124,5.548295,8.013039,7.647353,4.956439,6.928581,5.090716,6.687677,5.006865,9.311978,8.342994,6.413821,9.084263,6.776719,5.304247,6.598114,6.798145,4.599317,8.505406,6.422409,2.381453,6.143426,5.936016,8.052438,7.556633,6.799104,7.551962,6.284811,7.092467,3.705927,6.91619,6.314894,6.214188,5.860457,5.514356,8.538408,7.514117,6.082184,9.708378,6.616514,4.57679,12.548,5.836737,6.590203,6.043704,7.598248,7.729715,6.108103,5.62265,5.729892,5.977239,6.672379,6.838144,7.450425,6.055094,6.88753
"""ENSG00000115461""","""IGFBP5""",1.531313,4.93984,1.589817,3.596918,4.221713,5.694364,4.633449,7.55643,2.906119,6.264272,4.300236,4.642322,5.192593,5.82839,3.354076,6.16894,3.134675,4.718852,8.624212,3.38863,3.416752,8.240399,6.652194,7.699936,4.466548,3.895035,8.655896,4.393156,3.639881,5.553176,2.653826,6.227905,6.201601,9.962816,5.073338,4.806445,4.322065,3.59729,4.244579,4.54983,5.835025,6.82944,7.039622,4.689163,3.702583,6.163571,2.613793,2.308573,3.431658,4.664411,3.645008,4.07661,4.614307,5.113326,2.241138,3.67498,11.902128,2.718183,2.048483,1.477194,4.562139,6.748317,2.710774,4.358541,5.20171,6.416449
"""ENSG00000137801""","""THBS1""",10.787846,12.452547,9.390428,7.256898,9.546596,9.807684,8.395226,10.564428,9.994399,10.93104,10.041629,8.696442,11.235067,10.286238,7.73148,9.966976,10.189488,11.291271,8.163448,12.26609,11.428713,9.155845,12.369392,8.678094,8.537479,11.617824,10.461017,7.391252,11.719449,8.622247,6.537787,9.79737,9.035041,11.235494,11.164126,9.850056,10.290641,7.654049,11.137592,4.781831,10.478594,10.905235,9.504674,9.022241,8.020058,12.340003,9.573093,9.417571,11.567134,10.372583,7.060916,15.367104,8.483557,10.596553,8.780714,9.818366,11.521034,6.721186,6.61051,9.027454,7.719915,10.210917,10.274893,10.805033,10.464737,10.704895
"""ENSG00000173083""","""HPSE""",6.935416,7.987946,7.20935,5.125137,6.211795,5.990862,5.462574,6.413457,7.825867,8.089623,6.922915,6.288086,7.527803,6.888642,6.086392,6.349528,6.562359,6.139168,6.209023,8.389047,7.645464,6.792004,8.256308,6.635347,6.015047,6.27286,6.943379,5.713779,8.075317,5.712303,5.481236,6.143426,6.635867,6.400915,7.285251,6.45702,8.3526,5.8538,7.281664,5.741307,6.923299,7.189791,7.09444,6.704163,4.665372,8.178149,6.607652,6.537152,7.83118,7.217909,6.983234,11.844895,5.8875,8.039698,6.089552,6.931796,7.643238,5.951342,5.730754,6.129718,7.313044,6.989801,7.451146,6.940394,6.860065,6.735584
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000100320""","""RBFOX2""",8.22931,8.577745,8.793937,8.146168,8.50636,8.094082,8.412581,6.262423,8.824019,9.092984,8.123152,8.145417,8.134884,7.165401,7.635301,7.611847,8.661839,5.259503,7.394901,7.761067,7.65251,8.083627,6.969916,9.355596,6.342115,8.242365,6.032863,6.856847,7.337829,7.598508,5.508181,6.09535,8.075045,8.245913,7.133615,8.494605,7.07594,8.600279,8.234299,6.944529,6.121303,7.042001,7.561364,8.72022,6.691154,7.412463,5.706284,8.655129,8.009674,7.341117,2.289394,6.340976,6.346199,7.8873,7.275199,9.039884,7.44935,9.180781,8.788654,8.775473,9.47642,6.869862,7.52732,7.067754,7.545361,6.655649
"""ENSG00000131037""","""EPS8L1""",4.921367,5.092288,4.636338,4.303763,3.558799,4.273777,4.539356,3.657559,4.76519,5.2067,5.129906,5.249289,5.093011,4.608039,4.105194,3.421264,4.543386,4.570978,4.012894,4.479922,4.122518,5.584509,3.767809,3.859496,3.894477,4.436839,4.401572,4.939957,3.310004,4.954976,3.038821,3.685049,5.122019,5.154102,4.704891,4.004908,2.594864,5.67405,4.176327,3.059096,4.431902,2.579029,5.850097,4.698257,4.024754,5.202417,3.982463,6.590354,4.603486,2.425481,3.861162,4.223547,2.009216,3.985339,2.241138,5.239908,5.761738,3.852376,5.114053,5.401104,4.54089,4.377534,4.054711,4.791207,4.437371,4.463407
"""ENSG00000147138""","""GPR174""",7.472695,7.693039,8.470779,8.431479,8.423135,7.941875,7.586384,8.35664,10.20575,9.02655,8.031976,8.900432,9.261677,9.425081,9.225934,9.444781,8.44918,7.687412,8.051833,7.811535,8.339178,7.633405,7.297222,8.87963,7.210158,7.914855,6.834074,8.695152,4.52869,7.733878,10.22048,6.504612,9.587979,7.97627,8.012537,8.975441,8.739155,9.705377,8.709629,8.063136,7.39922,7.799343,7.265433,8.571631,9.109058,7.41099,4.540132,10.158007,8.317188,9.034955,8.597539,6.449957,7.772009,8.241796,7.973199,9.134118,8.364018,8.292342,8.913818,9.206939,9.270765,7.319049,8.83835,7.851796,7.858072,6.91692
"""ENSG00000152767""","""FARP1""",6.940183,7.49388,6.46252,5.675233,6.4785,6.855844,5.77485,7.895461,7.102404,5.689908,6.759296,6.770667,7.110603,7.269297,7.513641,6.432986,6.543011,6.975591,7.973696,6.347298,6.180977,7.222779,6.675958,6.867061,6.708219,7.062017,7.460929,6.16691,7.521092,7.119538,7.263643,8.050985,7.734846,9.232917,7.693798,6.712544,7.099249,6.624518,6.440258,7.417438,8.285402,7.647037,8.174316,6.902123,5.938316,8.029102,6.648249,7.433624,6.519701,6.396188,5.661561,6.111841,6.986761,7.610686,6.858305,7.330748,7.51952,6.998061,6.492616,6.120499,6.428605,7.674356,6.001296,6.299561,7.777429,6.503437


In [None]:
# miRNA: read the preprocessed table, then configure the splitter and run it.
mirna = pl.read_csv("mds_data/preprocessed/mirna.csv")

OmicDataSplitter(
    # data: omic table, its non-sample column, and the class labels
    df=mirna,
    annotation_cols=["miRNA"],
    y_df=y,
    # split / selection settings
    n_splits=5,
    n_features=200,
    random_state=3,
    # directory handed to the splitter for its results
    output_dir="mds_data/splits/mirna",
).process_data()

In [48]:
circrna = pl.read_csv("mds_data/preprocessed/circrna.csv")

# Some rows have no circRNA_ID. Give each of them a unique placeholder
# ("unknown_id_0", "unknown_id_1", ...) numbered in row order.
# TODO: move this into the preprocessing step that produces circrna.csv.
#
# The fill is done on a plain Python list rather than by item-assigning into
# the polars Series: Series item assignment is an in-place scatter per element,
# and the Series was taken straight from the frame.
circrna_ids = circrna["circRNA_ID"].to_list()
unknown_id = 0
for i, circrna_id in enumerate(circrna_ids):
    if circrna_id is None:
        circrna_ids[i] = f"unknown_id_{unknown_id}"
        unknown_id += 1

# Swap the patched IDs back in under the same column name.
circrna = circrna.with_columns(pl.Series("circRNA_ID", circrna_ids))

ods = OmicDataSplitter(
    df=circrna,
    annotation_cols=["circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME", "STRAND"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits/circrna",
)
ods.process_data()

[[ 8.45024723 11.30466803 10.53455241 ... 11.09063447 10.44715036
  11.20161678]
 [ 9.50236005 10.86371399 10.15371322 ... 10.50136511 10.39452646
  11.01563966]
 [ 9.2735342  11.56935656  9.2735342  ... 11.30152024  9.77534875
  11.75274602]
 ...
 [10.49776474 11.71958689 10.23492987 ... 11.23433117 10.72001461
  11.99961931]
 [10.0575871  11.16377689 10.52669629 ... 11.16377689 10.88012063
  12.00000144]
 [ 9.17708977 10.63493452  8.76288279 ... 11.34507503  9.98337618
  12.12911376]]
['hsa_circ_0001387', 'hsa_circ_0002484', 'hsa_circ_0007848', 'hsa_circ_0002711', 'hsa_circ_0000551', 'hsa_circ_0005542', 'hsa_circ_0004658', 'unknown_id_0', 'hsa_circ_0005881', 'hsa_circ_0001776', 'hsa_circ_0000211', 'hsa_circ_0002468', 'unknown_id_1', 'hsa_circ_0071185', 'unknown_id_2', 'unknown_id_3', 'hsa_circ_0004524', 'hsa_circ_0004771', 'hsa_circ_0000607', 'hsa_circ_0046760', 'hsa_circ_0001006', 'hsa_circ_0001333', 'unknown_id_4', 'hsa_circ_0019170', 'hsa_circ_0004058', 'hsa_circ_0071174', 'hsa_ci

100%|██████████| 200/200 [00:08<00:00, 23.62it/s]?fold/s]
100%|██████████| 200/200 [00:06<00:00, 29.23it/s]41, 10.42s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.17it/s]25,  8.62s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.16it/s]15,  7.87s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.17it/s]07,  7.53s/fold]
Processing folds: 100%|██████████| 5/5 [00:38<00:00,  7.75s/fold]


In [51]:
# piRNA: read the preprocessed table, then configure the splitter and run it.
# Note that this modality keeps 150 features, not the 200 used for the others.
pirna = pl.read_csv("mds_data/preprocessed/pirna.csv")

OmicDataSplitter(
    # data: omic table, its non-sample columns, and the class labels
    df=pirna,
    annotation_cols=["piRNA_name", "piRNA_id"],
    y_df=y,
    # split / selection settings
    n_splits=5,
    n_features=150,
    random_state=3,
    # directory handed to the splitter for its results
    output_dir="mds_data/splits/pirna",
).process_data()

[[ 8.45542275  5.56947475  5.2958623  ...  6.67494918 18.6346163
  11.93145659]
 [ 9.37331964  7.20165321  7.32284885 ...  6.70323076 18.91790228
  10.79516624]
 [ 9.0753424   6.19631102  6.1150057  ...  6.66472286 18.63644873
  11.38865747]
 ...
 [ 8.13573063  6.29115214  7.41039954 ...  6.92836094 18.59323893
  11.52812814]
 [ 8.93068584  6.59591563  5.5479645  ...  7.58843909 18.6691824
  11.87951857]
 [12.19174384  7.59227334  8.33926107 ...  8.03773732 17.56803802
  13.79078395]]
['hsa_piR_020485', 'hsa_piR_013624', 'hsa_piR_008488', 'hsa_piR_014629', 'hsa_piR_020008', 'hsa_piR_000651', 'hsa_piR_001184', 'hsa_piR_007635', 'hsa_piR_000775', 'hsa_piR_001318', 'hsa_piR_002485', 'hsa_piR_020381', 'hsa_piR_016926', 'hsa_piR_018849', 'hsa_piR_015249', 'hsa_piR_019675', 'hsa_piR_000753', 'hsa_piR_014620', 'hsa_piR_018573', 'hsa_piR_019324', 'hsa_piR_001169', 'hsa_piR_016659', 'hsa_piR_001356', 'hsa_piR_016735', 'hsa_piR_020490', 'hsa_piR_019420', 'hsa_piR_014923', 'hsa_piR_000552', 'hsa_

100%|██████████| 150/150 [00:02<00:00, 62.37it/s]?fold/s]
100%|██████████| 150/150 [00:02<00:00, 60.31it/s]09,  2.46s/fold]
100%|██████████| 150/150 [00:02<00:00, 62.74it/s]07,  2.50s/fold]
100%|██████████| 150/150 [00:02<00:00, 62.19it/s]04,  2.47s/fold]
100%|██████████| 150/150 [00:02<00:00, 61.68it/s]02,  2.47s/fold]
Processing folds: 100%|██████████| 5/5 [00:12<00:00,  2.47s/fold]


In [54]:
# Transposable-element counts: read the preprocessed table, then configure
# the splitter and run it.
te = pl.read_csv("mds_data/preprocessed/te_counts.csv")

# The stray bare `te` expression that used to sit here was removed: it was not
# the last expression of the cell, so it never displayed anything.
OmicDataSplitter(
    df=te,
    annotation_cols=["TE"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits/te_counts",
).process_data()

[[2.58399931 1.74808202 3.4490177  ... 5.71199936 8.2671351  6.29611433]
 [3.92374007 1.88044366 2.88811289 ... 6.01272797 8.28253309 6.55077555]
 [3.38226427 2.52253091 2.07551866 ... 5.13320406 7.79503833 5.70678332]
 ...
 [4.01368942 1.69052926 4.11576908 ... 5.67338819 8.26572606 6.06283089]
 [3.59403366 1.99370296 2.99137698 ... 5.23805625 7.96139047 6.11003081]
 [3.26458651 1.29254843 4.10318847 ... 5.60539548 8.3513269  6.43936641]]
['HERV-K14CI', 'MER67D', 'MIR3', 'HERVK11DI', 'MER41G', 'AluYd2', 'LTR43_I', 'MER57F', 'LTR43B', 'LTR47B2', 'LTR59', 'L1P4c_5end', 'HERVH', 'LTR7A', 'MER92B', 'LTR1C3', 'LTR62', 'ERV3-16A3_I', 'LTR30', 'LTR44', 'MER66B', 'LTR38A1', 'LTR38', 'MLT2F', 'AluYe2', 'LTR1B0', 'PABL_B', 'MER89', 'LTR1F1', 'LTR77', 'LTR38B', 'MER57C2', 'LTR26B', 'LTR34', 'LOR1b_LTR', 'MER50C', 'LTR64', 'LTR26E', 'MER50B', 'L2B', 'LTR22', 'MER34A', 'MER57C1', 'MER34', 'LTR1C1', 'MLT1E', 'HERVK13I', 'MLT1H2', 'MER51D', 'LTR60B', 'MER66D', 'LTR1B', 'L1M3C_5', 'MER101B', 'AluSx1'

100%|██████████| 200/200 [00:06<00:00, 31.54it/s]?fold/s]
100%|██████████| 200/200 [00:06<00:00, 30.80it/s]25,  6.47s/fold]
100%|██████████| 200/200 [00:06<00:00, 30.85it/s]19,  6.54s/fold]
100%|██████████| 200/200 [00:06<00:00, 31.38it/s]13,  6.57s/fold]
100%|██████████| 200/200 [00:06<00:00, 31.53it/s]06,  6.53s/fold]
Processing folds: 100%|██████████| 5/5 [00:32<00:00,  6.52s/fold]


In [17]:
# Scratch check of `max(axis=0)`: it returns the maximum of each column.
A = np.identity(3)
A[0, :2] = (2, 3)  # first row becomes [2, 3, 0]
print(A)
A.max(axis=0)

[[2. 3. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


array([2., 3., 1.])

In [21]:
# Preview the (train_indices, test_indices) pairs of a stratified, shuffled
# 5-fold split over the class labels.
skf = StratifiedKFold(n_splits=5, random_state=3, shuffle=True)

# A zero vector with one entry per sample stands in for the feature matrix.
[fold for fold in skf.split(np.zeros(len(y)), y["class"])]

[(array([ 0,  1,  2,  3,  6,  8,  9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21,
         23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 37, 38, 39, 40, 41, 43,
         45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63,
         64]),
  array([ 4,  5,  7, 11, 12, 22, 24, 34, 36, 42, 44, 46, 62, 65])),
 (array([ 0,  1,  2,  4,  5,  7,  8, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21,
         22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 40,
         41, 42, 43, 44, 45, 46, 48, 50, 52, 53, 54, 56, 57, 58, 59, 60, 61,
         62, 65]),
  array([ 3,  6,  9, 10, 15, 31, 32, 47, 49, 51, 55, 63, 64])),
 (array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 18,
         22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 40, 42, 43,
         44, 45, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 59, 60, 61, 62, 63,
         64, 65]),
  array([ 0, 17, 19, 20, 21, 23, 37, 38, 39, 41, 50, 53, 56])),
 (array([ 0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 1

In [29]:
# The first cell already binds `OmicDataSplitter` to the class from
# src.data_preprocessing, which the cells above construct with df=, y_df=,
# n_features= and output_dir=. The class in src.data_splitting is constructed
# differently (labels first, data passed to process_data), so it is imported
# under an alias here. Without the alias, re-running any earlier cell after
# this one would silently pick up the wrong class.
from src.data_splitting import OmicDataSplitter as FoldDataSplitter

ods = FoldDataSplitter(y, n_splits=5, random_state=3)

In [28]:
# Load the preprocessed mRNA table and show it as the cell output.
mrna = pl.read_csv("mds_data/preprocessed/mrna.csv")
mrna

GENE_ID,GENE_NAME,N58,N60,N70,N82,N83,N84,N85,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ENSG00000103184""","""SEC14L5""",4.574155,7.288239,3.561246,2.897224,3.615696,4.057362,2.203009,4.398603,3.467058,3.657675,4.619424,1.507977,4.809069,4.749144,2.146183,2.66225,3.555635,0.714897,2.010324,6.268556,5.914118,4.018251,6.781959,4.900504,3.481129,3.767197,5.167452,1.631362,5.933825,4.945851,0.444571,2.734007,4.452827,5.877369,6.07671,3.343197,6.156233,2.993291,5.727105,1.36271,5.001145,5.516594,6.915722,3.82929,3.319966,6.938185,5.118215,1.629126,5.885573,5.003287,2.039422,10.53412,2.297346,4.097657,3.891976,4.604959,5.844351,3.188559,1.986391,2.357737,3.549326,5.252131,3.976603,5.670529,4.456624,4.897539
"""ENSG00000119862""","""LGALSL""",7.455039,8.479401,6.306391,3.321911,6.188541,5.979059,5.223318,6.5576,6.140982,7.953992,6.599124,5.548295,8.013039,7.647353,4.956439,6.928581,5.090716,6.687677,5.006865,9.311978,8.342994,6.413821,9.084263,6.776719,5.304247,6.598114,6.798145,4.599317,8.505406,6.422409,2.381453,6.143426,5.936016,8.052438,7.556633,6.799104,7.551962,6.284811,7.092467,3.705927,6.91619,6.314894,6.214188,5.860457,5.514356,8.538408,7.514117,6.082184,9.708378,6.616514,4.57679,12.548,5.836737,6.590203,6.043704,7.598248,7.729715,6.108103,5.62265,5.729892,5.977239,6.672379,6.838144,7.450425,6.055094,6.88753
"""ENSG00000115461""","""IGFBP5""",1.531313,4.93984,1.589817,3.596918,4.221713,5.694364,4.633449,7.55643,2.906119,6.264272,4.300236,4.642322,5.192593,5.82839,3.354076,6.16894,3.134675,4.718852,8.624212,3.38863,3.416752,8.240399,6.652194,7.699936,4.466548,3.895035,8.655896,4.393156,3.639881,5.553176,2.653826,6.227905,6.201601,9.962816,5.073338,4.806445,4.322065,3.59729,4.244579,4.54983,5.835025,6.82944,7.039622,4.689163,3.702583,6.163571,2.613793,2.308573,3.431658,4.664411,3.645008,4.07661,4.614307,5.113326,2.241138,3.67498,11.902128,2.718183,2.048483,1.477194,4.562139,6.748317,2.710774,4.358541,5.20171,6.416449
"""ENSG00000137801""","""THBS1""",10.787846,12.452547,9.390428,7.256898,9.546596,9.807684,8.395226,10.564428,9.994399,10.93104,10.041629,8.696442,11.235067,10.286238,7.73148,9.966976,10.189488,11.291271,8.163448,12.26609,11.428713,9.155845,12.369392,8.678094,8.537479,11.617824,10.461017,7.391252,11.719449,8.622247,6.537787,9.79737,9.035041,11.235494,11.164126,9.850056,10.290641,7.654049,11.137592,4.781831,10.478594,10.905235,9.504674,9.022241,8.020058,12.340003,9.573093,9.417571,11.567134,10.372583,7.060916,15.367104,8.483557,10.596553,8.780714,9.818366,11.521034,6.721186,6.61051,9.027454,7.719915,10.210917,10.274893,10.805033,10.464737,10.704895
"""ENSG00000173083""","""HPSE""",6.935416,7.987946,7.20935,5.125137,6.211795,5.990862,5.462574,6.413457,7.825867,8.089623,6.922915,6.288086,7.527803,6.888642,6.086392,6.349528,6.562359,6.139168,6.209023,8.389047,7.645464,6.792004,8.256308,6.635347,6.015047,6.27286,6.943379,5.713779,8.075317,5.712303,5.481236,6.143426,6.635867,6.400915,7.285251,6.45702,8.3526,5.8538,7.281664,5.741307,6.923299,7.189791,7.09444,6.704163,4.665372,8.178149,6.607652,6.537152,7.83118,7.217909,6.983234,11.844895,5.8875,8.039698,6.089552,6.931796,7.643238,5.951342,5.730754,6.129718,7.313044,6.989801,7.451146,6.940394,6.860065,6.735584
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000100320""","""RBFOX2""",8.22931,8.577745,8.793937,8.146168,8.50636,8.094082,8.412581,6.262423,8.824019,9.092984,8.123152,8.145417,8.134884,7.165401,7.635301,7.611847,8.661839,5.259503,7.394901,7.761067,7.65251,8.083627,6.969916,9.355596,6.342115,8.242365,6.032863,6.856847,7.337829,7.598508,5.508181,6.09535,8.075045,8.245913,7.133615,8.494605,7.07594,8.600279,8.234299,6.944529,6.121303,7.042001,7.561364,8.72022,6.691154,7.412463,5.706284,8.655129,8.009674,7.341117,2.289394,6.340976,6.346199,7.8873,7.275199,9.039884,7.44935,9.180781,8.788654,8.775473,9.47642,6.869862,7.52732,7.067754,7.545361,6.655649
"""ENSG00000131037""","""EPS8L1""",4.921367,5.092288,4.636338,4.303763,3.558799,4.273777,4.539356,3.657559,4.76519,5.2067,5.129906,5.249289,5.093011,4.608039,4.105194,3.421264,4.543386,4.570978,4.012894,4.479922,4.122518,5.584509,3.767809,3.859496,3.894477,4.436839,4.401572,4.939957,3.310004,4.954976,3.038821,3.685049,5.122019,5.154102,4.704891,4.004908,2.594864,5.67405,4.176327,3.059096,4.431902,2.579029,5.850097,4.698257,4.024754,5.202417,3.982463,6.590354,4.603486,2.425481,3.861162,4.223547,2.009216,3.985339,2.241138,5.239908,5.761738,3.852376,5.114053,5.401104,4.54089,4.377534,4.054711,4.791207,4.437371,4.463407
"""ENSG00000147138""","""GPR174""",7.472695,7.693039,8.470779,8.431479,8.423135,7.941875,7.586384,8.35664,10.20575,9.02655,8.031976,8.900432,9.261677,9.425081,9.225934,9.444781,8.44918,7.687412,8.051833,7.811535,8.339178,7.633405,7.297222,8.87963,7.210158,7.914855,6.834074,8.695152,4.52869,7.733878,10.22048,6.504612,9.587979,7.97627,8.012537,8.975441,8.739155,9.705377,8.709629,8.063136,7.39922,7.799343,7.265433,8.571631,9.109058,7.41099,4.540132,10.158007,8.317188,9.034955,8.597539,6.449957,7.772009,8.241796,7.973199,9.134118,8.364018,8.292342,8.913818,9.206939,9.270765,7.319049,8.83835,7.851796,7.858072,6.91692
"""ENSG00000152767""","""FARP1""",6.940183,7.49388,6.46252,5.675233,6.4785,6.855844,5.77485,7.895461,7.102404,5.689908,6.759296,6.770667,7.110603,7.269297,7.513641,6.432986,6.543011,6.975591,7.973696,6.347298,6.180977,7.222779,6.675958,6.867061,6.708219,7.062017,7.460929,6.16691,7.521092,7.119538,7.263643,8.050985,7.734846,9.232917,7.693798,6.712544,7.099249,6.624518,6.440258,7.417438,8.285402,7.647037,8.174316,6.902123,5.938316,8.029102,6.648249,7.433624,6.519701,6.396188,5.661561,6.111841,6.986761,7.610686,6.858305,7.330748,7.51952,6.998061,6.492616,6.120499,6.428605,7.674356,6.001296,6.299561,7.777429,6.503437


In [30]:
# Run the splitter held in `ods` on the mRNA table.
# NOTE(review): the saved output of this cell is
#   TypeError: DataFrame.__init__() got an unexpected keyword argument 'columns'
# Nothing in this notebook passes `columns=`, so the failing DataFrame
# construction is presumably inside src.data_splitting. TODO: confirm and fix
# it there, then re-run this cell.
ods.process_data(
    mrna,
    "mds_data/splits/mrna",
    annotation_cols=["GENE_ID", "GENE_NAME"],
    feature_selection="mrmr",
    n_features=50,
    normalization="minmax",
)

TypeError: DataFrame.__init__() got an unexpected keyword argument 'columns'