# Feature selection and data splitting for MDS data

In [1]:
%load_ext autoreload
%autoreload 2

import polars as pl

from src.data_preprocessing import OmicDataSplitter

In [31]:
# Build the label frame `y` with two columns: "sample_ids" and "class".
# The label is taken from the "mutation" annotation column and shifted down by 1;
# samples whose shifted label is -1 are dropped.
#
# Alternative label columns used with the same recipe:
#   - "disease": shifted by 1, no filtering
#   - "risk":    shifted by 1, rows with class == -1 dropped
y = (
    pl.read_csv("mds_data/preprocessed_74/annotations_74.csv")
    .select(
        pl.col("SAMPLE_ID").alias("sample_ids"),
        (pl.col("mutation") - 1).alias("class"),
    )
    .filter(pl.col("class") != -1)
)

# Only the samples kept here (class != -1) are selected for the splits below.
y

sample_ids,class
str,i64
"""V108""",1
"""V125""",1
"""V1321""",1
"""V1456""",1
"""V1505""",0
…,…
"""V637""",0
"""V716""",1
"""V777""",0
"""V806""",1


In [32]:
# Folder holding the preprocessed omics CSV files that the cells below read.
preprocessed_data_folder = "mds_data/preprocessed_74"
# Root folder for the generated splits; each omic passes its own sub-folder
# (output_dir + "/<omic>") to OmicDataSplitter.
# NOTE(review): the "_mutation" suffix presumably mirrors the label column chosen for `y` — keep them in sync.
output_dir = "mds_data/splits_74_mutation"

In [33]:
# mRNA: load the preprocessed table and write the splits under <output_dir>/mrna.
mrna = pl.read_csv(f"{preprocessed_data_folder}/mrna.csv")

mrna_splitter = OmicDataSplitter(
    df=mrna,
    y_df=y,
    # Non-sample columns of the mRNA table.
    annotation_cols=["GENE_ID", "GENE_NAME"],
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir=f"{output_dir}/mrna",
)
mrna_splitter.process_data()

Kept 26 samples out of 74


100%|██████████| 200/200 [00:23<00:00,  8.69it/s]?fold/s]
100%|██████████| 200/200 [00:23<00:00,  8.50it/s]34, 23.52s/fold]
100%|██████████| 200/200 [00:22<00:00,  8.76it/s]11, 23.84s/fold]
100%|██████████| 200/200 [00:22<00:00,  8.78it/s]47, 23.66s/fold]
100%|██████████| 200/200 [00:22<00:00,  8.77it/s]23, 23.51s/fold]
Processing folds: 100%|██████████| 5/5 [01:57<00:00, 23.52s/fold]


In [34]:
# miRNA target genes: load the preprocessed table and write the splits under
# <output_dir>/mirna_genes.
# The input path is built from `preprocessed_data_folder` (as in the mRNA cell)
# so that changing the data folder in the config cell affects every omic.
mirna = pl.read_csv(f"{preprocessed_data_folder}/mirna_genes.csv")

OmicDataSplitter(
    df=mirna,
    # Non-sample columns of the table.
    annotation_cols=["GENE_ID", "GENE_NAME"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir=output_dir + "/mirna_genes",
).process_data()

Only 26 samples out of 74 found in y_df


100%|██████████| 200/200 [00:04<00:00, 46.89it/s]?fold/s]
100%|██████████| 200/200 [00:04<00:00, 47.32it/s]17,  4.36s/fold]
100%|██████████| 200/200 [00:04<00:00, 46.82it/s]13,  4.34s/fold]
100%|██████████| 200/200 [00:04<00:00, 46.83it/s]08,  4.34s/fold]
100%|██████████| 200/200 [00:04<00:00, 45.02it/s]04,  4.35s/fold]
Processing folds: 100%|██████████| 5/5 [00:21<00:00,  4.38s/fold]


In [None]:
# miRNA: load the preprocessed table and write the splits under <output_dir>/mirna.
# The input path is built from `preprocessed_data_folder` (as in the mRNA cell)
# so that changing the data folder in the config cell affects every omic.
# NOTE(review): this rebinds `mirna`, which the miRNA-genes cell also assigns.
mirna = pl.read_csv(f"{preprocessed_data_folder}/mirna.csv")

OmicDataSplitter(
    df=mirna,
    # Non-sample column of the table.
    annotation_cols=["miRNA"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir=output_dir + "/mirna",
).process_data()

In [35]:
# circRNA: load the preprocessed table from the shared data folder.
circrna = pl.read_csv(f"{preprocessed_data_folder}/circrna.csv")

# Rows with a missing (null or empty) circRNA_ID get a unique placeholder
# "unknown_id_<n>", numbered in row order starting at 0.
# TODO: move this into the preprocessing step so the CSV arrives with complete ids.
# The ids are patched in a plain Python list and written back once, instead of
# assigning into the polars Series element by element.
circrna_ids = circrna["circRNA_ID"].to_list()
unknown_id = 0
for i, circrna_id in enumerate(circrna_ids):
    if not circrna_id:
        circrna_ids[i] = f"unknown_id_{unknown_id}"
        unknown_id += 1
circrna = circrna.with_columns(pl.Series("circRNA_ID", circrna_ids, dtype=pl.Utf8))

# Write the splits under <output_dir>/circrna.
ods = OmicDataSplitter(
    df=circrna,
    # Non-sample columns of the table.
    annotation_cols=["circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME", "STRAND"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir=output_dir + "/circrna",
)
ods.process_data()

Only 26 samples out of 74 found in y_df


100%|██████████| 200/200 [00:06<00:00, 28.94it/s]?fold/s]
100%|██████████| 200/200 [00:06<00:00, 28.91it/s]28,  7.03s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.04it/s]21,  7.04s/fold]
100%|██████████| 200/200 [00:06<00:00, 29.02it/s]14,  7.03s/fold]
100%|██████████| 200/200 [00:06<00:00, 28.95it/s]07,  7.02s/fold]
Processing folds: 100%|██████████| 5/5 [00:35<00:00,  7.02s/fold]


In [None]:
# piRNA: load the table and write the splits under <output_dir>/pirna.
# NOTE(review): unlike the other omics this reads from "mds_data/preprocessed"
# (not the "preprocessed_74" folder) and keeps 150 features instead of 200 —
# confirm both are intentional before running this cell.
pirna = pl.read_csv("mds_data/preprocessed/pirna.csv")

pirna_splitter = OmicDataSplitter(
    df=pirna,
    y_df=y,
    # Non-sample columns of the table.
    annotation_cols=["piRNA_name", "piRNA_id"],
    n_features=150,
    n_splits=5,
    random_state=3,
    output_dir=f"{output_dir}/pirna",
)
pirna_splitter.process_data()

In [36]:
# Transposable-element counts: load the preprocessed table and write the splits
# under <output_dir>/te_counts.
# The input path is built from `preprocessed_data_folder` (as in the mRNA cell)
# so that changing the data folder in the config cell affects every omic.
te = pl.read_csv(f"{preprocessed_data_folder}/te_counts.csv")

OmicDataSplitter(
    df=te,
    # Non-sample column of the table.
    annotation_cols=["TE"],
    y_df=y,
    n_features=200,
    n_splits=5,
    random_state=3,
    output_dir=output_dir + "/te_counts",
).process_data()

Only 26 samples out of 74 found in y_df


100%|██████████| 200/200 [00:06<00:00, 31.53it/s]?fold/s]
100%|██████████| 200/200 [00:06<00:00, 31.12it/s]25,  6.49s/fold]
100%|██████████| 200/200 [00:06<00:00, 30.51it/s]19,  6.51s/fold]
100%|██████████| 200/200 [00:06<00:00, 31.06it/s]13,  6.58s/fold]
100%|██████████| 200/200 [00:06<00:00, 30.83it/s]06,  6.57s/fold]
Processing folds: 100%|██████████| 5/5 [00:32<00:00,  6.56s/fold]
