# Feature selection and data splitting for MDS data

In [1]:
%load_ext autoreload
%autoreload 2

import polars as pl

from src.data_preprocessing import OmicDataSplitter

In [3]:
# Load the sample annotations and reduce them to the two columns passed to
# every splitter below: `sample_ids` and a `class` label.
y = pl.read_csv("mds_data/preprocessed_74/annotations_74.csv").select(
    pl.col("SAMPLE_ID").alias("sample_ids"),
    # Shift the disease code down by one (the displayed labels are 0 and 1).
    (pl.col("disease") - 1).alias("class"),
)
y

sample_ids,class
str,i64
"""N54""",0
"""N58""",0
"""N60""",0
"""N70""",0
"""N82""",0
…,…
"""V806""",1
"""V839""",1
"""V883""",1
"""V888""",1


In [4]:
# mRNA table: rows are annotated by GENE_ID / GENE_NAME.
mrna = pl.read_csv("mds_data/preprocessed_74/mrna.csv")

# Configure the splitter first, then run it, so the two steps read separately.
# NOTE(review): presumably n_features is the number of features selected per
# fold and the results are written under output_dir -- confirm in
# src.data_preprocessing.
mrna_splitter = OmicDataSplitter(
    df=mrna,
    annotation_cols=["GENE_ID", "GENE_NAME"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits_74/mrna",
)
mrna_splitter.process_data()

100%|██████████| 200/200 [00:22<00:00,  8.72it/s]?fold/s]
100%|██████████| 200/200 [00:22<00:00,  8.72it/s]39, 24.77s/fold]
100%|██████████| 200/200 [00:23<00:00,  8.63it/s]12, 24.02s/fold]
100%|██████████| 200/200 [00:22<00:00,  8.75it/s]47, 23.87s/fold]
100%|██████████| 200/200 [00:22<00:00,  8.82it/s]23, 23.66s/fold]
Processing folds: 100%|██████████| 5/5 [01:58<00:00, 23.70s/fold]


In [5]:
# miRNA gene-level table (mirna_genes.csv), annotated by GENE_ID / GENE_NAME.
# The name `mirna` is reused by the following cell for a different file.
mirna = pl.read_csv("mds_data/preprocessed_74/mirna_genes.csv")

# Same splitter settings as the other 74-sample omics tables.
mirna_genes_splitter = OmicDataSplitter(
    df=mirna,
    annotation_cols=["GENE_ID", "GENE_NAME"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits_74/mirna_genes",
)
mirna_genes_splitter.process_data()

100%|██████████| 200/200 [00:04<00:00, 48.49it/s]?fold/s]
100%|██████████| 200/200 [00:04<00:00, 49.30it/s]16,  4.22s/fold]
100%|██████████| 200/200 [00:04<00:00, 48.67it/s]12,  4.17s/fold]
100%|██████████| 200/200 [00:04<00:00, 47.95it/s]08,  4.18s/fold]
100%|██████████| 200/200 [00:04<00:00, 48.95it/s]04,  4.21s/fold]
Processing folds: 100%|██████████| 5/5 [00:20<00:00,  4.19s/fold]


In [None]:
# miRNA table (mirna.csv); its only annotation column is "miRNA".
mirna = pl.read_csv("mds_data/preprocessed_74/mirna.csv")

# Same splitter settings as the other 74-sample omics tables.
mirna_splitter = OmicDataSplitter(
    df=mirna,
    annotation_cols=["miRNA"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits_74/mirna",
)
mirna_splitter.process_data()

In [None]:
circrna = pl.read_csv("mds_data/preprocessed_74/circrna.csv")

# Replace missing circRNA ids (null or empty string) with unique placeholders
# "unknown_id_0", "unknown_id_1", ... numbered in row order.
# TODO: move this fill into the preprocessing step that produces circrna.csv.
#
# The ids are rebuilt in a single pass over a plain Python list rather than by
# assigning into the polars Series element by element, which mutates the
# column in place and issues one Series write per row.
filled_ids = []
unknown_id = 0
for raw_id in circrna["circRNA_ID"].to_list():
    if raw_id:
        filled_ids.append(raw_id)
    else:
        filled_ids.append(f"unknown_id_{unknown_id}")
        unknown_id += 1
circrna = circrna.with_columns(pl.Series("circRNA_ID", filled_ids))

# circRNA rows carry five annotation columns; everything else is passed to the
# splitter with the same settings as the other 74-sample omics tables.
ods = OmicDataSplitter(
    df=circrna,
    annotation_cols=["circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME", "STRAND"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits_74/circrna",
)
ods.process_data()

100%|██████████| 200/200 [00:06<00:00, 29.09it/s]?fold/s]
100%|██████████| 200/200 [00:06<00:00, 29.01it/s]28,  7.03s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.35it/s]21,  7.02s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.34it/s]13,  6.99s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.16it/s]06,  6.97s/fold]
Processing folds: 100%|██████████| 5/5 [00:34<00:00,  6.98s/fold]


In [51]:
# piRNA table, annotated by piRNA_name / piRNA_id.
# NOTE(review): unlike every other cell, this one reads from
# "mds_data/preprocessed" and writes to "mds_data/splits" (no "_74" suffix)
# and uses n_features=150, while `y` comes from the 74-sample annotations.
# Confirm this is intentional and not a leftover from an earlier run.
pirna = pl.read_csv("mds_data/preprocessed/pirna.csv")

pirna_splitter = OmicDataSplitter(
    df=pirna,
    annotation_cols=["piRNA_name", "piRNA_id"],
    y_df=y,
    n_features=150,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits/pirna",
)
pirna_splitter.process_data()

[[ 8.45542275  5.56947475  5.2958623  ...  6.67494918 18.6346163
  11.93145659]
 [ 9.37331964  7.20165321  7.32284885 ...  6.70323076 18.91790228
  10.79516624]
 [ 9.0753424   6.19631102  6.1150057  ...  6.66472286 18.63644873
  11.38865747]
 ...
 [ 8.13573063  6.29115214  7.41039954 ...  6.92836094 18.59323893
  11.52812814]
 [ 8.93068584  6.59591563  5.5479645  ...  7.58843909 18.6691824
  11.87951857]
 [12.19174384  7.59227334  8.33926107 ...  8.03773732 17.56803802
  13.79078395]]
['hsa_piR_020485', 'hsa_piR_013624', 'hsa_piR_008488', 'hsa_piR_014629', 'hsa_piR_020008', 'hsa_piR_000651', 'hsa_piR_001184', 'hsa_piR_007635', 'hsa_piR_000775', 'hsa_piR_001318', 'hsa_piR_002485', 'hsa_piR_020381', 'hsa_piR_016926', 'hsa_piR_018849', 'hsa_piR_015249', 'hsa_piR_019675', 'hsa_piR_000753', 'hsa_piR_014620', 'hsa_piR_018573', 'hsa_piR_019324', 'hsa_piR_001169', 'hsa_piR_016659', 'hsa_piR_001356', 'hsa_piR_016735', 'hsa_piR_020490', 'hsa_piR_019420', 'hsa_piR_014923', 'hsa_piR_000552', 'hsa_

100%|██████████| 150/150 [00:02<00:00, 62.37it/s]?fold/s]
100%|██████████| 150/150 [00:02<00:00, 60.31it/s]09,  2.46s/fold]
100%|██████████| 150/150 [00:02<00:00, 62.74it/s]07,  2.50s/fold]
100%|██████████| 150/150 [00:02<00:00, 62.19it/s]04,  2.47s/fold]
100%|██████████| 150/150 [00:02<00:00, 61.68it/s]02,  2.47s/fold]
Processing folds: 100%|██████████| 5/5 [00:12<00:00,  2.47s/fold]


In [7]:
# Transposable-element count table; its only annotation column is "TE".
te = pl.read_csv("mds_data/preprocessed_74/te_counts.csv")

# Same splitter settings as the other 74-sample omics tables.
# (A bare `te` expression used to sit here; it was not the last expression in
# the cell, so it displayed nothing, and has been removed.)
OmicDataSplitter(
    df=te,
    annotation_cols=["TE"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir="mds_data/splits_74/te_counts",
).process_data()

100%|██████████| 200/200 [00:06<00:00, 31.84it/s]?fold/s]
100%|██████████| 200/200 [00:06<00:00, 31.61it/s]25,  6.42s/fold]
100%|██████████| 200/200 [00:06<00:00, 31.54it/s]19,  6.44s/fold]
100%|██████████| 200/200 [00:06<00:00, 31.90it/s]12,  6.45s/fold]
100%|██████████| 200/200 [00:06<00:00, 31.23it/s]06,  6.42s/fold]
Processing folds: 100%|██████████| 5/5 [00:32<00:00,  6.44s/fold]
