In [171]:
%load_ext autoreload
%autoreload 2

import numpy as np
import polars as pl

from src.data_cleaning import get_common_elements
from src.seqeuencing_data_preprocessing import preprocess_seq_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load and transform annotations

In [172]:
# Load the raw sample sheet.
sample_sheet = pl.read_excel("mds_data/raw/sample sheet for CVUT.xlsx")

# Build the annotation table:
#   - SAMPLE_ID is the part of SAMPLE_NAME before the first "_"
#   - rows are ordered by SAMPLE_ID
#   - the run/platform/read-count columns are dropped
#   - the numbered clinical columns get short names
annotations = (
    sample_sheet.with_columns(
        pl.col("SAMPLE_NAME")
        .str.split("_")
        # Native list accessor instead of a per-row Python lambda; str.split
        # always yields at least one element, so index 0 is always present.
        .list.get(0)
        .alias("SAMPLE_ID")
    )
    .sort("SAMPLE_ID")
    .drop("RUN", "PLATFORM", "N_UNIQ_MAP_READS", "UNIQ_MAP")
    .rename(
        {
            "1 disease": "disease",
            "2 risk": "risk",
            "3 mutations (SF3B1only_wt)": "mutations",
        }
    )
    .select("SAMPLE_NAME", "SAMPLE_ID", "GROUP", "disease", "risk", "mutations")
)

# Left disabled, as in the original. The "common samples" section reads this
# CSV back in, so re-enable the line whenever the file has to be regenerated.
# annotations.write_csv("mds_data/preprocessed/annotations.csv")

# load mRNA

In [173]:
# Read the full RNA count table and tabulate how many rows carry each GENE_TYPE.
mrna = pl.read_csv(
    "mds_data/raw/200625_allRNA_fromRNAseq_annot_hg38.tsv",
    separator="\t",
)
gene_type_counts = mrna["GENE_TYPE"].value_counts(sort=True)
# -1 lifts polars' display limits so the table is printed without truncation.
with pl.Config(tbl_cols=-1, tbl_rows=-1):
    print(gene_type_counts)

shape: (46, 2)
┌─────────────────────────────────┬───────┐
│ GENE_TYPE                       ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ protein_coding                  ┆ 19873 │
│ processed_pseudogene            ┆ 10209 │
│ lincRNA                         ┆ 7430  │
│ antisense                       ┆ 5479  │
│ unprocessed_pseudogene          ┆ 2645  │
│ misc_RNA                        ┆ 2212  │
│ snRNA                           ┆ 1900  │
│ miRNA                           ┆ 1879  │
│ TEC                             ┆ 1065  │
│ snoRNA                          ┆ 943   │
│ sense_intronic                  ┆ 893   │
│ transcribed_unprocessed_pseudo… ┆ 853   │
│ processed_transcript            ┆ 550   │
│ rRNA                            ┆ 542   │
│ transcribed_processed_pseudoge… ┆ 471   │
│ IG_V_pseudogene                 ┆ 188   │
│ sense_overlapping               ┆ 179   │
│ IG_V_gene      

In [174]:
# mRNA: reload the count table, keep only protein-coding rows, and drop the
# coordinate / gene-type columns.
mrna = (
    pl.read_csv(
        "mds_data/raw/200625_allRNA_fromRNAseq_annot_hg38.tsv",
        separator="\t",
    )
    .filter(pl.col("GENE_TYPE") == "protein_coding")
    .drop(["CHR", "START", "END", "GENE_TYPE"])
)
gene_ids = mrna["GENE_ID"]

# Columns from index 2 onward are the per-sample counts (GENE_ID and GENE_NAME
# come first); each is renamed to the text before its first "_".
rename_dict = dict(
    (name, name.split("_")[0]) for name in mrna.columns[2:]
)
mrna = mrna.rename(rename_dict)

mrna_names = mrna.columns[2:]
mrna

GENE_ID,GENE_NAME,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V1249,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1426,V1441,V1456,V148,V1505,V1528,V1554,V1565,V1577,V1591,V1592,…,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V681,V708,V712,V714,V716,V777,V788,V795,V806,V833,V839,V853,V883,V888,V940,V956,V957
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""ENSG00000188026""","""RILPL1""",407,235,244,266,89,232,198,246,134,279,131,133,114,404,60,207,196,172,146,256,393,195,110,106,463,128,239,203,223,153,65,384,146,226,210,…,200,120,362,315,93,170,376,120,715,180,314,263,128,426,352,164,538,44,131,432,262,98,65,92,355,167,283,122,458,439,139,260,273,474,364,176,69
"""ENSG00000167578""","""RAB4B""",951,951,856,905,749,865,1441,682,851,681,849,555,454,764,1080,812,1214,710,1111,613,1526,1348,1390,765,1665,527,1259,1026,1317,1183,412,1682,1278,1177,753,…,914,1798,937,812,304,636,1165,1137,1514,1024,920,989,948,2094,1350,1989,1261,460,685,1374,778,831,786,1370,1304,882,3043,1605,801,1118,907,1815,887,1130,813,0,1242
"""ENSG00000078237""","""TIGAR""",580,184,691,258,393,356,288,482,275,277,325,240,329,404,301,240,777,296,404,173,343,504,878,133,374,686,606,485,322,309,151,508,731,488,568,…,332,511,424,218,142,304,511,590,475,564,366,400,219,998,376,817,609,80,342,408,471,336,217,630,1098,243,746,361,499,486,1073,317,328,362,227,242,522
"""ENSG00000158486""","""DNAH3""",18,0,16,8,30,4,10,0,18,9,3,3,10,0,3,0,27,29,0,12,97,0,4,2,31,6,0,195,8,0,8,11,7,3,0,…,0,12,92,17,266,0,7,8,4,3,3,0,2,20,43,0,12,0,134,6,0,4,9,80,23,10,227,0,3,47,109,0,0,19,2,0,17
"""ENSG00000283967""","""RP11-432M8.3""",0,1,0,0,2,2,1,2,1,1,0,0,1,0,1,0,2,3,0,3,2,0,0,0,0,1,0,0,1,2,3,0,2,1,0,…,0,0,7,2,8,0,1,0,6,1,0,0,0,2,0,0,1,0,7,1,0,0,1,0,0,0,1,0,0,2,1,0,4,2,1,1,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000186115""","""CYP4F2""",152,182,25,144,178,120,105,76,257,203,114,26,57,0,129,487,449,81,381,43,139,32,0,324,60,621,138,255,115,30,5,281,910,50,1116,…,116,22,14,474,58,59,119,248,38,8,11,127,177,8,96,458,157,191,58,213,308,752,481,237,138,7,468,545,11,143,226,70,152,84,171,0,278
"""ENSG00000009694""","""TENM1""",22,32,11,42,20,32,26,25,75,33,19,30,3,11,173,5,54,31,25,46,93,2876,28,26,40,48,7,90,23,17,52,49,55,29,7,…,23,31,78,126,108,138,15,232,231,107,38,7,18,203,39,4,16,6,777,11,59,46,18,54,58,35,97,36,19,44,57,30,23,36,32,0,412
"""ENSG00000123685""","""BATF3""",127,67,134,147,23,216,82,90,51,58,64,17,72,83,86,178,147,124,76,70,50,115,34,100,185,166,128,131,150,73,49,227,239,103,48,…,33,104,109,92,98,315,119,37,223,60,52,74,67,184,182,140,351,25,76,210,185,90,181,168,343,262,165,315,472,215,106,366,137,131,84,92,53
"""ENSG00000105063""","""PPP6R1""",22845,28193,26489,21006,17065,22682,26769,18729,20738,22708,18446,18690,18843,24906,15464,18758,21505,28065,21519,26893,24247,22835,31129,19041,93501,24492,25224,19994,24718,21241,8544,43097,30355,28033,17662,…,17448,34875,15198,22322,4353,14988,20336,32244,31834,21384,15798,18774,17235,29815,30757,27047,23945,7123,14442,23880,29027,27304,13041,20407,23342,15997,46899,40751,27759,24271,16824,33950,23426,30607,25504,0,25170


- accounted for differences in sequencing depth between samples -> gene counts are now comparable across samples
- computed scale-invariant variance for each feature

# load miRNA

In [175]:
# miRNA: read the count sheet; column 0 is the miRNA identifier and every
# later column is a sample.
mirna = pl.read_excel("mds_data/raw/final_all_samples_miRNA_seq.xlsx")

# Shorten each sample column to the text before its first "_".
mirna_rename_dict = dict(
    (name, name.partition("_")[0]) for name in mirna.columns[1:]
)
mirna = mirna.rename(mirna_rename_dict)
mirna_names = mirna.columns[1:]

mirna

miRNA,V1565,N58,V1874,V777,N80,V1788,N65,V2368,N81,N59,V2286,V406,V100,N82,V2133,V574,V2115,V1921,V714,V637,V1742,V1744,V2248,V1428,V18,V1857,V839,V912,V1048,V911,V940,V681,V708,N60,N70,V148,…,V1441,V1699,V1297,V1321,V1505,V1249,V1456,V1426,V1394,V1592,V1528,V1591,V833,V1708,V1800,V1776,V1823,V1775,V1834,V2378,V2414,V1860,V1884,V1920,V2322,V2311,V2291,V1957,V2092,V2284,V2278,V2110,V2179,V2147,V2224,V2089,V788
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""hsa-let-7a-2-3p""",23,0,11,12,11,0,13,12,12,36,15,0,35,0,17,10,12,10,13,0,0,18,58,11,12,60,0,17,26,13,18,0,13,0,0,0,…,0,15,10,0,27,0,0,0,0,10,0,18,0,0,10,19,79,12,0,17,24,17,0,12,33,0,13,0,10,12,17,47,0,13,26,20,12
"""hsa-let-7a-3p""",1224,1619,3376,1972,1811,1490,589,1721,2311,1272,1211,1291,280,1455,197,1503,1785,1379,1150,877,1239,5322,1659,1081,4406,431,1859,2536,2123,2414,1062,1164,1876,642,663,1079,…,1970,1087,1096,1382,1564,2106,1657,542,1472,1084,1287,1164,1887,1440,1033,1326,879,2074,1043,3247,1356,1351,1133,242,3349,1745,1555,1015,1132,1667,3147,1072,797,1027,4693,1156,2476
"""hsa-let-7a-5p""",608369,933004,1069405,850554,594247,1153938,272068,927860,608709,612056,445097,781218,122906,565251,118793,305157,1480722,512584,513964,414213,647469,1169076,569087,430376,821909,424407,145849,198109,250684,127034,181953,412468,315491,546646,555368,669588,…,803740,600765,772108,873003,522350,980458,359406,173762,1578499,967461,538431,420551,641671,1141110,1320356,598512,423203,827435,475608,1073692,604248,491170,735693,321751,654808,587831,722210,522961,451919,622551,737114,464613,481072,606788,716735,575703,175859
"""hsa-let-7b-3p""",1001,932,1023,1177,614,1251,285,1086,615,825,617,880,185,796,280,428,1437,571,588,477,682,2064,710,653,1556,460,639,1164,811,964,528,947,695,578,799,1046,…,1406,540,1149,1080,955,1515,567,483,1356,867,894,504,1018,1922,960,678,434,1138,726,1606,857,800,934,402,1031,972,980,648,592,1132,1185,842,766,959,1045,1071,1190
"""hsa-let-7b-5p""",342092,304774,294765,290467,176982,430718,85112,340512,176784,226291,145897,319534,43416,204520,37736,85009,435604,149807,181362,153956,268507,409761,169404,160704,272913,189890,67291,91100,104068,41298,72557,128479,137384,163905,191757,281571,…,324863,195570,401014,305985,171342,412420,125376,125395,653889,414149,191121,140175,196653,639686,607506,170347,151499,285312,179951,392510,187292,148536,284648,23841,199909,210815,244025,61806,150372,254426,242105,175841,174812,279244,212584,214623,47734
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""hsa-miR-98-5p""",3623,6190,11127,5755,7987,7638,3615,4912,8379,6262,5413,5450,1803,3653,1182,8473,10794,6428,3961,4019,4443,18940,5849,3854,16665,2817,1123,979,1967,1027,1221,2805,1337,3109,3969,3602,…,3447,5269,3617,3639,8158,8577,3443,812,8640,3890,6773,6405,4029,3371,6547,4734,5314,6176,3302,6752,7932,8478,4598,611,7403,5955,10223,1818,5579,7627,6885,5565,5088,4885,7689,4639,1691
"""hsa-miR-99a-3p""",145,578,2236,1246,721,1013,403,1020,834,636,206,571,88,891,124,309,1206,545,472,163,1245,1755,318,390,1226,325,286,445,514,429,572,295,541,305,961,440,…,781,337,754,1960,399,591,522,86,1033,302,200,439,1310,546,683,377,132,1758,1193,846,396,439,255,455,860,322,232,1239,392,643,765,260,205,689,1085,243,428
"""hsa-miR-99a-5p""",2325,5912,46227,14205,10895,7921,13504,8229,18405,20334,1753,5294,690,4773,3681,6721,25169,12275,2998,9710,5980,14416,4179,2720,13514,3690,2604,1975,2367,3836,2411,2446,2441,2907,9476,4754,…,7129,3175,12337,15035,2854,9555,2547,1000,9952,2709,1768,2669,19219,5157,5804,3801,1972,13403,13523,8872,7846,6208,1859,3834,5936,3779,2658,12357,3347,9017,8966,3626,2045,5751,10492,1472,2934
"""hsa-miR-99b-3p""",346,489,462,1045,264,385,156,388,267,314,118,377,80,330,79,298,1007,144,305,65,152,568,214,154,261,204,168,193,200,146,276,215,310,199,332,412,…,377,143,554,580,272,342,126,258,368,453,276,279,1007,266,548,179,96,618,287,778,356,155,83,600,358,312,696,1094,168,442,277,171,129,288,348,163,169


In [176]:
# Number of sample IDs present in both the mRNA and the miRNA tables.
len(set(mrna_names) & set(mirna_names))

77

# load circRNA

In [177]:
# circRNA: read the table (the literal "NA" is parsed as null), zero-fill nulls
# in every column except the annotation columns listed in exclude(), then keep
# exon-type rows whose gene is protein_coding / lincRNA / antisense and drop
# the coordinate and TYPE columns.
circrna = (
    pl.read_csv(
        "mds_data/raw/200625_circRNA_fromRNAseq_annot_hg19.tsv",
        separator="\t",
        null_values=["NA"],
    )
    .with_columns(
        pl.all()
        .exclude(
            "CHR", "START", "END", "TYPE", "STRAND",
            "circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME",
        )
        .fill_null(strategy="zero")
    )
    .filter(
        (pl.col("GENE_TYPE").is_in(["protein_coding", "lincRNA", "antisense"]))
        & (pl.col("TYPE") == "exon")
    )
    .drop("CHR", "START", "END", "TYPE")
)

# The first five remaining columns are annotations; everything after them is a
# sample column, renamed to the text before its first "_".
circrna_rename_dict = dict(
    (name, name.split("_")[0]) for name in circrna.columns[5:]
)
circrna = circrna.rename(circrna_rename_dict)
# circrna[:, :9]
circrna_names = circrna.columns[5:]
circrna

STRAND,circRNA_ID,GENE_ID,GENE_TYPE,GENE_NAME,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V1249,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1426,V1441,V1456,V148,V1505,V1528,V1554,V1565,…,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V681,V708,V712,V714,V716,V777,V788,V795,V806,V833,V839,V853,V883,V888,V940,V956,V957
str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""-""","""hsa_circ_0009205""","""ENSG00000188976""","""protein_coding""","""NOC2L""",2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,4,0,0,0,0,0
"""-""","""hsa_circ_0000002""","""ENSG00000078808""","""protein_coding""","""SDF4""",17,23,25,23,22,23,32,27,35,26,23,13,8,13,10,18,23,31,16,15,49,31,104,16,34,23,27,32,25,35,32,28,…,11,68,12,8,0,0,8,38,34,19,19,17,17,36,11,53,23,5,33,41,34,28,10,25,21,33,49,53,29,11,5,123,33,33,18,29,15
"""-""",,"""ENSG00000107404""","""protein_coding""","""DVL1""",2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""-""",,"""ENSG00000160075""","""protein_coding""","""SSU72""",3,0,0,0,0,2,0,2,0,3,0,4,0,3,0,2,0,3,0,4,4,0,0,0,10,0,0,0,0,0,0,4,…,3,0,0,0,0,0,0,3,0,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""-""","""hsa_circ_0000007""","""ENSG00000078369""","""protein_coding""","""GNB1""",6,6,0,2,3,4,8,7,4,4,5,3,4,5,6,0,14,3,4,6,9,4,4,2,7,4,0,6,0,2,7,0,…,5,24,0,0,0,5,3,9,2,3,2,5,3,7,4,4,0,3,4,0,4,2,0,5,7,5,0,17,6,0,8,2,0,8,3,2,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""+""",,"""ENSG00000182150""","""protein_coding""","""ERCC6L2""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
"""-""",,"""ENSG00000136861""","""protein_coding""","""CDK5RAP2""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
"""+""",,"""ENSG00000107164""","""protein_coding""","""FUBP3""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
"""-""",,"""ENSG00000160271""","""protein_coding""","""RALGDS""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2


# load te counts

In [178]:
# TE counts: column 0 names the transposable element, the rest are samples.
te_counts = pl.read_csv("mds_data/raw/TE_counts.csv")

# Shorten each sample column to the text before its first "_".
te_counts_rename_dict = dict(
    (name, name.partition("_")[0]) for name in te_counts.columns[1:]
)
te_counts = te_counts.rename(te_counts_rename_dict)
te_counts_names = te_counts.columns[1:]
te_counts

TE,V1744,V2286,V1776,V220,V108,V2089,V221,V806,V1920,N60,V777,V1048,V1394,V1554,V344,V2110,V630,V2311,V1592,V480,V1884,V1337,NV911,V1249,V1565,V712,V714,N85,V2241,V1708,V18,V833,V1800,V1664,V1426,V839,…,V1591,V716,V655,V2092,V883,V67,NV1428,V1834,V382,N87,V125,V2368,V1775,N54,V2147,V681,V957,V1297,V940,N58,V2414,V637,V2284,V1689,V2248,V1823,V553,V148,V1505,V2322,V624,V2179,V1456,V1742,V1577,V456,V689
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ALU""",5887.91,5431.56,6959.7,8833.7,8986.45,10564.3,7065.91,7599.44,9787.8,9306.07,6061.99,6385.99,5579.7,4948.74,8698.03,5222.05,6877.89,8274.9,7800.57,5598.86,13599.0,7430.7,6199.62,12059.9,7059.21,6516.82,6303.3,7290.05,6080.92,9530.9,6264.46,10614.5,8582.4,5114.27,10942.3,7881.51,…,6148.82,11432.6,7078.72,4629.16,6081.12,8766.84,8097.26,9892.05,6815.32,7966.3,8590.44,6934.2,8178.09,8622.08,4299.38,9650.8,8026.74,7721.05,6593.15,7019.28,9786.57,7523.42,7382.74,6771.68,6835.69,4946.98,13410.6,7903.22,6618.57,4923.12,6690.64,4851.81,5909.45,9511.53,8769.84,8183.79,5019.56
"""AluJb""",21104.5,19944.4,23285.4,30996.4,27678.3,25346.5,26280.0,28709.9,38261.4,35332.8,22834.9,22606.9,19816.4,17392.3,26701.3,22804.4,24896.5,33242.2,28009.3,20592.4,44735.8,24996.5,21670.0,44371.0,27218.6,23824.4,21455.6,27695.8,23690.8,29308.3,21639.5,40095.7,33905.1,20905.2,35242.6,27292.4,…,24160.7,40157.1,26045.3,18683.8,22348.0,22323.5,25259.1,40239.7,27603.6,27669.4,34593.1,27649.2,20355.4,26817.6,16289.2,35666.1,27516.2,24279.6,23699.4,26406.7,23972.5,21814.8,27724.2,23908.0,29645.6,17890.9,59175.4,32247.8,21067.6,17729.7,24140.9,18699.9,21118.1,35568.6,36400.8,28090.8,17515.2
"""AluJo""",3241.23,3076.36,2831.86,4266.9,2273.46,101.421,3609.03,4184.29,5137.53,4933.68,2941.52,3670.61,2744.18,2217.45,3709.33,3165.13,2666.19,3452.7,3451.88,3147.54,4257.09,3504.48,3348.82,5805.75,3744.24,3006.96,2624.93,3677.09,3107.17,2902.06,2626.65,5094.82,4512.76,2585.24,4192.17,3839.13,…,3312.06,3849.75,3315.78,2532.95,3394.73,74.3837,2740.94,4179.26,3611.5,3584.89,4826.98,3597.06,59.6774,2462.54,2374.59,4874.16,3463.95,2681.86,3666.81,4019.45,33.1035,8.64467,4039.87,3335.72,3797.78,2786.46,7165.52,4382.27,2930.32,2043.76,3233.89,2758.92,2426.03,5215.14,4723.25,3125.83,2670.85
"""AluJr""",863.084,1021.31,943.936,1472.02,1141.14,936.128,1158.42,1238.01,1694.92,1550.93,1003.92,1048.49,688.697,927.231,1286.5,930.15,1063.58,1443.85,1135.73,774.718,1806.28,1002.38,935.308,1769.08,1180.33,1130.95,886.429,1153.25,863.661,1213.18,842.567,1812.97,1278.44,641.212,1534.48,1087.77,…,939.429,1840.33,1299.6,881.944,1015.1,964.59,1032.61,1870.29,1263.94,1316.59,1348.67,1158.08,833.327,1099.74,804.413,1579.61,1213.87,1046.95,1163.09,1220.17,918.734,1019.45,1235.09,1146.1,1277.6,736.021,2560.71,1347.83,802.787,724.04,1000.22,803.441,857.841,1759.8,1563.59,1174.99,554.36
"""AluJr4""",6219.15,5283.81,5759.8,7817.57,6722.98,7226.66,6557.64,7286.93,9023.09,8444.05,5932.29,6158.79,5702.57,4410.33,7028.46,5670.5,6414.47,7431.23,7221.28,5169.38,10841.0,6421.22,5709.1,10130.3,7417.67,5925.11,5720.99,7137.76,5902.87,7221.67,6530.15,9500.76,8615.35,5656.09,9291.27,6929.24,…,6496.37,10055.5,6111.24,4784.68,5684.14,5960.01,6608.18,9185.61,6911.11,7409.45,8564.76,6298.29,5168.7,6563.21,4846.74,8963.76,7385.98,5862.15,6350.26,6707.35,6238.01,5240.83,6716.72,6094.02,6534.11,5106.92,13894.9,7512.87,6093.45,4434.0,6364.48,5424.9,5299.94,8767.59,9004.87,6738.61,4403.27
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""X5A_LINE""",8.0,12.0,5.0,14.0,21.0,5.0,13.0,8.0,11.0,9.0,8.0,8.0,9.0,7.0,12.0,1.0,4.0,5.0,24.0,10.0,3.0,11.0,1.0,14.0,2.0,5.0,3.0,0.0,14.0,12.0,6.0,9.0,7.0,8.0,1.0,16.0,…,11.0,14.0,4.0,6.0,8.0,3.0,10.0,14.0,12.0,2.0,8.0,3.0,15.0,9.0,11.0,5.0,5.0,4.0,14.0,4.0,16.0,1.0,11.0,8.0,7.0,10.0,11.0,7.0,5.0,5.0,4.0,7.0,7.0,6.0,6.0,16.0,6.0
"""X5B_LINE""",3.0,5.0,8.0,5.0,7.0,7.0,7.0,14.0,1.0,13.0,0.0,4.0,2.0,0.0,3.0,10.0,8.0,2.0,5.0,12.0,5.0,3.0,3.0,1.0,8.0,5.0,15.0,11.0,12.0,3.0,12.0,8.0,4.0,16.0,6.0,10.0,…,5.0,19.0,6.0,3.0,15.0,4.0,6.0,0.0,14.0,8.0,8.0,1.0,1.0,10.0,4.0,9.0,6.0,6.0,10.0,8.0,9.0,0.0,11.0,6.0,8.0,1.0,12.0,22.0,1.0,3.0,6.0,2.0,13.0,10.0,4.0,3.0,1.0
"""X6A_LINE""",0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"""X6B_LINE""",1.0,5.0,3.0,1.0,7.0,5.0,3.0,5.0,0.0,6.0,4.0,5.0,2.0,0.0,3.0,1.0,1.0,4.0,3.0,3.0,0.0,1.0,0.0,0.0,2.0,4.0,8.0,2.0,4.0,1.0,4.0,4.0,5.0,2.0,3.0,4.0,…,1.0,13.0,12.0,11.0,1.0,0.0,2.0,11.0,6.0,5.0,4.0,6.0,6.0,6.0,2.0,4.0,7.0,2.0,1.0,0.0,4.0,0.0,3.0,1.0,3.0,0.0,3.0,4.0,2.0,3.0,5.0,1.0,0.0,2.0,3.0,4.0,5.0


# load pirna (skipped for now)

In [179]:
# piRNA: read the count sheet; column 0 ("piRNA") is a "/"-separated label and
# every later column is a sample.
pirna = pl.read_excel("mds_data/raw/piRNA_counts.xlsx")

# Shorten each sample column to the text before its first "_".
pirna_rename_dict = dict(
    (name, name.split("_")[0]) for name in pirna.columns[1:]
)
pirna = pirna.rename(pirna_rename_dict)
# Captured before the two derived annotation columns are appended below, so the
# list holds sample columns only.
pirna_names = pirna.columns[1:]

# Split the label on "/": field 0 becomes piRNA_name, field 2 becomes piRNA_id;
# the original combined column is then removed.
pirna = pirna.with_columns(
    pl.col("piRNA").str.split("/").list.get(0).alias("piRNA_name"),
    pl.col("piRNA").str.split("/").list.get(2).alias("piRNA_id"),
)
pirna = pirna.drop("piRNA")

pirna

V1565,N58,V1874,V777,N80,V1788,N65,V2368,N81,N59,V2286,V406,V100,N82,V2133,V574,V2115,V1921,V714,V637,V1742,V1744,V2248,V1428,V18,V1857,V839,V912,V1048,V911,V940,V681,V708,N60,N70,V148,V655,…,V1297,V1321,V1505,V1249,V1456,V1426,V1394,V1592,V1528,V1591,V833,V1708,V1800,V1776,V1823,V1775,V1834,V2378,V2414,V1860,V1884,V1920,V2322,V2311,V2291,V1957,V2092,V2284,V2278,V2110,V2179,V2147,V2224,V2089,V788,piRNA_name,piRNA_id
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_006779""","""DQ579258"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_007653""","""DQ580430"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_009540""","""DQ582872"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_000302""","""DQ570363"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_000390""","""DQ570472"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
315,274,230,160,333,154,116,192,213,244,83,147,332,275,94,50,239,168,515,14,122,162,188,111,309,573,18,18,50,59,45,98,70,133,203,138,176,…,225,170,193,210,79,42,418,327,144,70,209,246,319,128,88,160,214,139,183,87,94,127,212,78,136,187,67,154,203,222,139,273,161,175,43,"""hsa_piR_020814""","""DQ598650"""
316,458,493,422,279,266,140,417,243,395,220,74,361,450,77,120,715,170,210,12,289,688,336,203,604,137,104,193,270,276,265,202,410,132,225,291,937,…,157,1068,389,405,177,89,1005,578,329,314,574,551,458,187,117,421,669,646,183,250,170,131,635,248,170,864,126,293,571,231,224,281,574,183,120,"""hsa_piR_020815""","""DQ598651"""
2732,1064,1346,610,1598,894,848,528,1525,1060,810,565,824,1131,380,1242,833,1493,2428,146,901,1991,378,714,3829,766,226,175,544,380,632,547,312,589,472,1735,1421,…,383,193,1198,264,610,112,492,475,1173,1241,330,385,292,375,506,325,687,330,1212,777,665,138,1620,357,621,874,573,661,1069,1170,784,749,1192,921,391,"""hsa_piR_020829""","""DQ598677"""
131,177,441,190,287,281,247,164,429,191,119,190,51,158,36,254,325,244,195,152,139,717,376,227,657,43,201,109,104,249,112,246,45,61,111,181,43,…,149,156,276,444,135,24,132,141,199,153,191,135,152,239,130,275,173,222,322,465,188,137,556,240,166,385,133,400,496,261,98,171,398,271,217,"""hsa_piR_021032""","""DQ598918"""


# common samples

In [180]:
# Reload the annotation table from its preprocessed CSV and collect the
# annotated sample IDs as a plain Python list.
annotations = pl.read_csv("mds_data/preprocessed/annotations.csv")
annotated_samples = annotations.get_column("SAMPLE_ID").to_list()

In [181]:
# Sample IDs shared by the annotation sheet and all five omics tables.
# get_common_elements is imported in the notebook's top import cell.
common_names = get_common_elements(
    annotated_samples, mrna_names, mirna_names, circrna_names, pirna_names, te_counts_names
)

# Sorted in place so the column order of the later select() calls is
# deterministic.
common_names.sort()

# Report the size of every input to the intersection; the original print left
# out annotated_samples and pirna_names.
print(
    f"lengths: {len(annotated_samples)=}, {len(mrna_names)=}, {len(mirna_names)=}, "
    f"{len(circrna_names)=}, {len(pirna_names)=}, {len(te_counts_names)=}"
)
len(common_names)

lengths: len(mrna_names)=86, len(mirna_names)=105, len(circrna_names)=86, len(te_counts_names)=112


66

### notes about overlapping data
    - mirna_names, te_counts_names = 98 (+12, most data available)
    - mrna, circrna = 86 (all of mrna data, baseline number of samples)
    - mrna, circrna, te_counts = 85 (-1)
    - mrna, circrna, te_counts, mirna = 77 (-8)
    - mrna_names, pirna_names, circrna_names, te_counts_names = 76 (-9)
    - mrna_names, mirna, pirna_names, circrna_names, te_counts_names = 76 (-9)
    - annotations only exist for 66 (-19) samples

- there is data both for miRNA precursor genes (in the allRNA dataframe) and for the mature miRNAs (which should generally be less noisy, even though the two are correlated)

## select common samples from each df

In [182]:
# Keep the two gene annotation columns plus the shared samples, in sorted order.
mrna = mrna.select(["GENE_ID", "GENE_NAME", *common_names])

In [183]:
# Keep the miRNA identifier plus the shared samples, in sorted order.
mirna = mirna.select(["miRNA", *common_names])

In [184]:
# Keep the five circRNA annotation columns plus the shared samples, in sorted order.
circrna = circrna.select(
    ["STRAND", "circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME", *common_names]
)

In [185]:
# Move the two piRNA annotation columns to the front and keep the shared samples.
pirna = pirna.select(["piRNA_name", "piRNA_id", *common_names])

In [186]:
# Keep the TE identifier plus the shared samples, in sorted order.
te_counts = te_counts.select(["TE", *common_names])

# data preprocessing

In [197]:
# Run the project's preprocessing helper on the mRNA table; the value it
# returns is shown as the cell output.
# NOTE(review): the exact meaning of the thresholds and what is written to
# output_path are defined in src.seqeuencing_data_preprocessing — confirm there.
preprocess_seq_data(
    df=mrna,
    annotation_cols=["GENE_ID", "GENE_NAME"],
    min_median_expression=100,
    min_detection_rate=0.2,
    top_n=3000,
    output_path="mds_data/preprocessed/mrna.csv",
)

GENE_ID,GENE_NAME,N58,N60,N70,N82,N83,N84,N85,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ENSG00000103184""","""SEC14L5""",4.574155,7.288239,3.561246,2.897224,3.615696,4.057362,2.203009,4.398603,3.467058,3.657675,4.619424,1.507977,4.809069,4.749144,2.146183,2.66225,3.555635,0.714897,2.010324,6.268556,5.914118,4.018251,6.781959,4.900504,3.481129,3.767197,5.167452,1.631362,5.933825,4.945851,0.444571,2.734007,4.452827,5.877369,6.07671,3.343197,6.156233,2.993291,5.727105,1.36271,5.001145,5.516594,6.915722,3.82929,3.319966,6.938185,5.118215,1.629126,5.885573,5.003287,2.039422,10.53412,2.297346,4.097657,3.891976,4.604959,5.844351,3.188559,1.986391,2.357737,3.549326,5.252131,3.976603,5.670529,4.456624,4.897539
"""ENSG00000119862""","""LGALSL""",7.455039,8.479401,6.306391,3.321911,6.188541,5.979059,5.223318,6.5576,6.140982,7.953992,6.599124,5.548295,8.013039,7.647353,4.956439,6.928581,5.090716,6.687677,5.006865,9.311978,8.342994,6.413821,9.084263,6.776719,5.304247,6.598114,6.798145,4.599317,8.505406,6.422409,2.381453,6.143426,5.936016,8.052438,7.556633,6.799104,7.551962,6.284811,7.092467,3.705927,6.91619,6.314894,6.214188,5.860457,5.514356,8.538408,7.514117,6.082184,9.708378,6.616514,4.57679,12.548,5.836737,6.590203,6.043704,7.598248,7.729715,6.108103,5.62265,5.729892,5.977239,6.672379,6.838144,7.450425,6.055094,6.88753
"""ENSG00000115461""","""IGFBP5""",1.531313,4.93984,1.589817,3.596918,4.221713,5.694364,4.633449,7.55643,2.906119,6.264272,4.300236,4.642322,5.192593,5.82839,3.354076,6.16894,3.134675,4.718852,8.624212,3.38863,3.416752,8.240399,6.652194,7.699936,4.466548,3.895035,8.655896,4.393156,3.639881,5.553176,2.653826,6.227905,6.201601,9.962816,5.073338,4.806445,4.322065,3.59729,4.244579,4.54983,5.835025,6.82944,7.039622,4.689163,3.702583,6.163571,2.613793,2.308573,3.431658,4.664411,3.645008,4.07661,4.614307,5.113326,2.241138,3.67498,11.902128,2.718183,2.048483,1.477194,4.562139,6.748317,2.710774,4.358541,5.20171,6.416449
"""ENSG00000137801""","""THBS1""",10.787846,12.452547,9.390428,7.256898,9.546596,9.807684,8.395226,10.564428,9.994399,10.93104,10.041629,8.696442,11.235067,10.286238,7.73148,9.966976,10.189488,11.291271,8.163448,12.26609,11.428713,9.155845,12.369392,8.678094,8.537479,11.617824,10.461017,7.391252,11.719449,8.622247,6.537787,9.79737,9.035041,11.235494,11.164126,9.850056,10.290641,7.654049,11.137592,4.781831,10.478594,10.905235,9.504674,9.022241,8.020058,12.340003,9.573093,9.417571,11.567134,10.372583,7.060916,15.367104,8.483557,10.596553,8.780714,9.818366,11.521034,6.721186,6.61051,9.027454,7.719915,10.210917,10.274893,10.805033,10.464737,10.704895
"""ENSG00000173083""","""HPSE""",6.935416,7.987946,7.20935,5.125137,6.211795,5.990862,5.462574,6.413457,7.825867,8.089623,6.922915,6.288086,7.527803,6.888642,6.086392,6.349528,6.562359,6.139168,6.209023,8.389047,7.645464,6.792004,8.256308,6.635347,6.015047,6.27286,6.943379,5.713779,8.075317,5.712303,5.481236,6.143426,6.635867,6.400915,7.285251,6.45702,8.3526,5.8538,7.281664,5.741307,6.923299,7.189791,7.09444,6.704163,4.665372,8.178149,6.607652,6.537152,7.83118,7.217909,6.983234,11.844895,5.8875,8.039698,6.089552,6.931796,7.643238,5.951342,5.730754,6.129718,7.313044,6.989801,7.451146,6.940394,6.860065,6.735584
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000100320""","""RBFOX2""",8.22931,8.577745,8.793937,8.146168,8.50636,8.094082,8.412581,6.262423,8.824019,9.092984,8.123152,8.145417,8.134884,7.165401,7.635301,7.611847,8.661839,5.259503,7.394901,7.761067,7.65251,8.083627,6.969916,9.355596,6.342115,8.242365,6.032863,6.856847,7.337829,7.598508,5.508181,6.09535,8.075045,8.245913,7.133615,8.494605,7.07594,8.600279,8.234299,6.944529,6.121303,7.042001,7.561364,8.72022,6.691154,7.412463,5.706284,8.655129,8.009674,7.341117,2.289394,6.340976,6.346199,7.8873,7.275199,9.039884,7.44935,9.180781,8.788654,8.775473,9.47642,6.869862,7.52732,7.067754,7.545361,6.655649
"""ENSG00000131037""","""EPS8L1""",4.921367,5.092288,4.636338,4.303763,3.558799,4.273777,4.539356,3.657559,4.76519,5.2067,5.129906,5.249289,5.093011,4.608039,4.105194,3.421264,4.543386,4.570978,4.012894,4.479922,4.122518,5.584509,3.767809,3.859496,3.894477,4.436839,4.401572,4.939957,3.310004,4.954976,3.038821,3.685049,5.122019,5.154102,4.704891,4.004908,2.594864,5.67405,4.176327,3.059096,4.431902,2.579029,5.850097,4.698257,4.024754,5.202417,3.982463,6.590354,4.603486,2.425481,3.861162,4.223547,2.009216,3.985339,2.241138,5.239908,5.761738,3.852376,5.114053,5.401104,4.54089,4.377534,4.054711,4.791207,4.437371,4.463407
"""ENSG00000147138""","""GPR174""",7.472695,7.693039,8.470779,8.431479,8.423135,7.941875,7.586384,8.35664,10.20575,9.02655,8.031976,8.900432,9.261677,9.425081,9.225934,9.444781,8.44918,7.687412,8.051833,7.811535,8.339178,7.633405,7.297222,8.87963,7.210158,7.914855,6.834074,8.695152,4.52869,7.733878,10.22048,6.504612,9.587979,7.97627,8.012537,8.975441,8.739155,9.705377,8.709629,8.063136,7.39922,7.799343,7.265433,8.571631,9.109058,7.41099,4.540132,10.158007,8.317188,9.034955,8.597539,6.449957,7.772009,8.241796,7.973199,9.134118,8.364018,8.292342,8.913818,9.206939,9.270765,7.319049,8.83835,7.851796,7.858072,6.91692
"""ENSG00000152767""","""FARP1""",6.940183,7.49388,6.46252,5.675233,6.4785,6.855844,5.77485,7.895461,7.102404,5.689908,6.759296,6.770667,7.110603,7.269297,7.513641,6.432986,6.543011,6.975591,7.973696,6.347298,6.180977,7.222779,6.675958,6.867061,6.708219,7.062017,7.460929,6.16691,7.521092,7.119538,7.263643,8.050985,7.734846,9.232917,7.693798,6.712544,7.099249,6.624518,6.440258,7.417438,8.285402,7.647037,8.174316,6.902123,5.938316,8.029102,6.648249,7.433624,6.519701,6.396188,5.661561,6.111841,6.986761,7.610686,6.858305,7.330748,7.51952,6.998061,6.492616,6.120499,6.428605,7.674356,6.001296,6.299561,7.777429,6.503437


In [188]:
# Run the project's preprocessing helper on the miRNA table; the value it
# returns is shown as the cell output.
# NOTE(review): the exact meaning of the thresholds and what is written to
# output_path are defined in src.seqeuencing_data_preprocessing — confirm there.
preprocess_seq_data(
    df=mirna,
    annotation_cols=["miRNA"],
    min_median_expression=10,
    min_detection_rate=0.2,
    top_n=1500,
    output_path="mds_data/preprocessed/mirna.csv",
)

miRNA,N58,N60,N70,N82,N83,N84,N85,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""hsa-miR-4707-3p""",1.437319,2.868655,2.611191,1.96315,1.261456,1.819225,2.300344,3.857893,1.977615,1.612355,2.736769,1.95294,1.782329,0.987596,1.575661,1.22627,0.910819,2.307684,2.353511,2.625116,1.840606,2.634192,2.512477,1.022918,1.676363,0.0,2.183077,2.599831,1.68106,0.0,1.683612,1.801399,1.523008,10.816231,2.027256,0.680056,0.0,2.804438,2.568437,2.003521,3.42994,3.883035,4.300455,0.730621,1.577371,2.291888,1.651564,1.21204,1.846952,2.044078,2.968013,2.761515,2.915449,1.338999,2.617268,2.71419,1.020286,1.976244,2.350208,2.084655,0.0,2.853775,3.321994,2.338173,1.985976,5.661481
"""hsa-miR-4317""",0.0,0.0,2.491443,1.418228,1.469891,2.028103,1.7567,2.179682,0.918714,1.294169,1.95474,1.168984,1.904883,0.0,0.0,0.968545,0.0,1.719866,1.743709,1.657133,2.359118,2.238846,1.566001,0.0,0.0,0.950629,0.0,1.406375,0.0,0.866447,0.0,1.725976,0.0,3.147094,1.519918,0.634146,1.611668,0.0,1.793202,2.094741,8.919123,3.179177,0.0,0.943429,1.491448,1.316155,1.517217,0.0,0.0,0.0,2.334624,2.070472,0.0,0.915061,0.0,1.485195,1.15962,0.0,1.443037,0.0,0.0,1.411989,0.0,1.929196,1.086277,2.4986
"""hsa-miR-510-3p""",8.521073,8.331569,7.206841,7.832334,7.761779,7.621583,7.515796,8.858364,6.823789,6.825742,8.20419,6.6295,6.002929,5.615593,7.215579,5.670966,6.164486,7.530656,8.372939,6.613607,6.467872,9.112306,8.154453,7.690422,7.575755,5.634325,7.795855,7.577814,6.420217,8.625506,5.680818,11.308101,5.735568,16.646321,7.362756,8.561969,7.451579,8.264114,9.650701,6.984423,7.075258,7.650473,10.29136,5.981327,7.40725,7.260454,6.248953,6.438578,7.679142,6.253436,9.257809,9.451838,7.294966,6.919408,11.429695,8.543727,7.266277,8.157708,7.850544,7.953444,6.198886,7.395316,8.725328,7.67784,6.837269,8.24584
"""hsa-miR-1203""",2.467375,3.940989,3.784001,3.294088,2.212001,2.140301,3.045564,4.524436,2.906916,2.21129,4.014962,1.806145,1.551647,0.0,1.638199,4.27113,0.910819,3.610628,2.012233,1.705574,2.464095,2.690704,3.518252,2.098863,3.064153,0.0,2.183077,3.782979,1.176919,1.693459,2.456725,4.869333,3.557643,12.114166,3.415263,1.381767,2.941513,8.946766,2.678376,3.894443,4.153857,6.224417,5.654182,1.1717,2.934128,2.291888,2.182225,3.974116,2.657119,2.425925,5.205245,2.20576,4.160133,2.397042,3.204358,2.201352,0.970682,4.391799,2.558838,1.457095,1.307781,1.334922,4.320428,2.148128,2.703579,7.201495
"""hsa-miR-4655-5p""",1.028397,1.858618,2.057756,2.003828,1.204277,1.344454,1.651371,3.399611,1.649399,1.221151,1.92193,1.856755,1.393163,0.88895,1.369928,0.0,0.0,1.827393,1.564358,1.657133,1.732679,2.200438,2.00238,0.0,0.0,0.0,1.470201,2.546478,1.125735,0.0,1.242711,3.251263,1.475189,9.501852,1.519918,1.002645,1.224308,3.179175,1.337122,2.915136,3.411387,3.459814,4.096487,0.0,1.535049,1.38108,1.469503,1.305416,1.303397,2.199346,2.598185,1.754619,2.152034,1.033283,1.610864,2.251779,0.919311,1.593396,1.757045,0.0,1.165182,1.411989,2.046349,0.0,1.786904,4.819859
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""hsa-miR-505-3p""",5.813925,4.787444,5.13205,6.150255,5.894901,6.249783,6.13658,7.64054,5.590744,5.931464,6.156691,6.045456,5.692865,5.70348,6.27284,5.844364,5.706159,5.913204,6.186635,6.370516,6.79746,6.547031,6.813855,5.524623,6.27093,5.601948,5.888054,6.470982,6.512771,6.093064,4.456274,6.608551,5.230394,7.440369,6.743245,5.124075,6.1911,6.106777,6.261837,6.438179,6.661821,6.357479,5.795841,5.897639,5.575105,6.308943,5.815391,4.50373,6.362991,6.023344,5.474925,6.083046,5.795417,6.274399,5.430089,5.85627,5.227707,6.214807,6.262376,5.480225,5.808986,6.48078,6.763068,5.630038,6.203694,7.088642
"""hsa-miR-766-5p""",3.07499,2.76097,2.522325,2.900582,3.047251,3.156495,3.327849,3.218339,3.143357,2.575849,2.828141,3.338324,2.927483,2.182172,3.28024,2.271266,2.43311,2.577842,3.250196,3.199246,3.501619,3.034282,3.305837,3.150709,3.041695,1.754539,2.507642,3.341117,2.805177,2.854505,2.908198,3.475067,2.383017,3.916987,3.369988,2.648297,3.088241,3.843288,3.204679,3.344855,3.122966,3.495794,2.747446,2.168183,2.500467,2.675134,2.739907,2.189887,3.451258,2.976916,3.058334,3.178272,3.229851,2.879818,4.535242,3.198459,2.030323,1.871665,3.499981,2.167069,2.819207,2.768424,2.695668,2.656453,3.244197,2.756775
"""hsa-miR-3915""",1.847753,3.36186,3.213107,3.19217,2.780095,3.045599,3.145923,4.051015,2.923943,2.485202,2.947057,2.779912,2.452328,1.434912,2.739431,1.97614,1.333535,3.214762,2.434103,2.878525,3.190939,2.416901,3.24746,2.62296,3.192021,1.790345,2.681668,2.818151,2.209512,2.269586,1.925211,2.953558,1.657631,3.642214,2.945217,1.002645,1.99323,3.843288,2.87592,3.51964,2.471041,3.56517,4.51235,1.667149,2.208961,2.451554,2.997101,2.497484,3.079531,3.519255,3.969031,2.497104,3.396418,2.329318,1.954601,3.091766,2.005906,2.873397,3.201313,1.582515,2.770777,2.881138,3.531787,2.620301,2.37481,3.944586
"""hsa-miR-103b""",8.255029,8.819859,8.755532,8.694311,8.322652,8.366231,8.248476,9.444442,7.522033,8.082241,8.322044,8.718554,8.771537,7.730089,8.509887,7.751314,7.824188,8.256102,8.428093,8.972458,8.90072,9.109718,8.912276,8.217978,8.567063,8.225445,8.788193,8.125148,8.813943,8.674909,7.249996,9.35588,8.530168,8.661122,8.912231,8.184181,8.485192,9.380139,8.926746,8.911443,9.023977,9.231308,9.70008,7.608657,7.74034,9.30788,8.267288,7.157962,8.742572,8.971845,9.031934,9.160614,8.488311,8.613428,6.959611,8.815577,8.26416,8.426607,8.76387,8.014541,8.327705,9.009072,9.065401,8.553703,8.741949,10.019022


In [189]:
# circRNA table: five leading annotation columns, the remaining columns are
# per-sample values. The returned frame is shown as the cell output.
# NOTE(review): the thresholds are passed straight through to the project
# helper; their exact meaning is defined there -- confirm before tuning.
preprocess_seq_data(
    df=circrna,
    annotation_cols=[
        "STRAND",
        "circRNA_ID",
        "GENE_ID",
        "GENE_TYPE",
        "GENE_NAME",
    ],
    min_median_expression=5,
    min_detection_rate=0.2,
    top_n=1500,
    output_path="mds_data/preprocessed/circrna.csv",
)

STRAND,circRNA_ID,GENE_ID,GENE_TYPE,GENE_NAME,N58,N60,N70,N82,N83,N84,N85,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""+""","""hsa_circ_0001387""","""ENSG00000109685""","""protein_coding""","""NSD2""",8.450247,9.50236,9.273534,9.675946,9.180145,8.892947,9.673358,0.0,0.0,7.734399,9.357757,8.225304,9.105657,7.535502,9.523741,6.077786,7.314609,10.084417,11.34087,10.010991,8.640138,10.072279,10.292926,8.541385,9.906549,8.257142,10.097526,9.651289,8.62221,9.96775,8.859867,10.528114,9.369982,9.595812,9.556637,11.642622,9.537085,9.733417,8.63642,11.697294,9.665389,0.0,0.0,7.829564,9.543215,10.219459,9.891581,8.402509,9.832489,8.368302,9.899485,8.761428,8.905724,9.307617,0.0,10.287065,10.198622,8.726896,13.09702,9.105965,11.040003,9.102624,10.044361,10.497765,10.057587,9.17709
"""-""","""hsa_circ_0002484""","""ENSG00000196323""","""protein_coding""","""ZBTB44""",11.304668,10.863714,11.569357,11.567741,11.763033,11.553355,12.077916,11.372927,11.901279,11.313138,11.858446,11.957804,11.562944,11.864375,11.770121,11.963519,11.553925,11.94595,12.366122,11.679884,11.280953,12.028215,11.462212,11.823308,11.671022,12.060111,11.141242,12.042153,15.027978,11.461585,11.179308,10.528114,10.854007,10.180153,11.614074,12.368268,11.651068,11.246888,11.333789,12.535352,11.936998,12.223559,11.487746,11.216131,11.657204,11.218854,14.251592,11.305699,10.738641,11.027586,11.942734,11.421591,11.260853,11.746331,12.040887,10.734216,11.079416,11.85316,12.887593,9.742459,12.277768,12.069959,11.517418,11.719587,11.163777,10.634935
"""+""","""hsa_circ_0007848""","""ENSG00000270106""","""protein_coding""","""TSNAX-DISC1""",10.534552,10.153713,9.273534,9.86837,10.401115,10.00679,10.520569,11.220984,11.34203,9.314838,11.019217,11.028521,11.103691,11.089082,11.107396,12.471559,11.905806,10.209837,10.533932,8.554004,10.444907,10.443943,8.296372,10.997645,9.584997,10.253601,10.620687,10.457875,10.426947,10.89995,11.80712,0.0,11.953127,9.595812,9.971195,10.402229,10.1214,8.513282,10.218963,10.297469,10.539051,0.0,11.902658,10.824004,10.372444,9.978671,10.043432,10.306269,9.417979,10.949621,10.898729,9.175634,10.947838,10.668795,10.62642,11.149042,10.650804,10.424976,10.004285,10.104655,10.214562,10.837752,10.78078,10.23493,10.526696,8.762883
"""-""","""hsa_circ_0002711""","""ENSG00000180530""","""protein_coding""","""NRIP1""",11.304668,11.217183,12.427968,11.390925,10.841407,10.213053,10.672474,9.053173,10.416531,9.899046,10.231227,10.337068,8.108273,13.118828,9.845277,7.648438,8.631089,11.431534,9.704614,10.136406,10.338072,12.443166,11.292351,0.0,10.391546,9.254782,10.184912,9.972858,10.07931,10.093161,11.125893,10.528114,10.047236,11.594413,9.293985,11.527182,8.953093,12.990633,10.803522,8.656047,11.538566,0.0,12.487495,9.994547,10.073135,10.326287,11.52798,13.256367,11.355019,10.102208,0.0,10.174386,6.91472,11.498482,0.0,9.413558,10.198622,10.531816,12.13565,12.634086,13.245917,9.517005,7.87921,9.72087,10.444292,11.345075
"""-""","""hsa_circ_0000551""","""ENSG00000100678""","""protein_coding""","""SLC8A3""",10.694915,10.693886,10.774525,0.0,9.764278,9.766036,9.351871,8.63904,0.0,10.728459,10.093843,9.444939,0.0,11.415758,7.529609,7.936637,10.394053,0.0,9.342537,9.552084,9.223894,9.891885,9.615544,7.545252,9.170585,11.867514,8.422348,10.351038,8.791728,10.093161,8.57105,0.0,8.051321,9.595812,8.972632,10.112959,0.0,11.246888,8.22259,7.246958,9.081315,8.953153,10.903035,8.412411,7.962115,8.842882,7.726962,11.890472,10.416924,9.588197,0.0,0.0,6.91472,9.499977,0.0,8.415672,9.950925,10.811752,9.112404,8.260061,11.96373,10.422977,8.876143,10.613165,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""+""","""hsa_circ_0006354""","""ENSG00000049245""","""protein_coding""","""VAMP3""",10.44715,9.864488,9.594996,11.390925,10.841407,11.06051,11.157648,8.055886,11.486367,10.976253,10.858834,10.806244,11.627054,10.791553,11.329695,10.403108,9.891999,10.872354,9.993806,10.358615,11.222084,9.685668,11.37978,11.29297,11.284135,9.954313,10.556599,11.50797,11.142744,10.507869,10.352627,11.67958,11.746732,11.594413,11.614074,10.264832,10.950909,12.012178,10.881482,10.511436,8.083976,12.066053,10.903035,9.632381,11.416277,10.688608,10.627939,10.983914,11.475269,10.365024,12.068234,10.981206,10.48868,11.356514,10.62642,10.734216,10.038302,9.725192,10.390977,10.68918,10.214562,11.322953,10.78078,9.72087,11.111333,9.17709
"""-""","""hsa_circ_0001439""","""ENSG00000151466""","""protein_coding""","""SCLT1""",11.905924,11.822696,11.617435,12.481788,12.177967,11.627331,11.904627,11.298955,11.416003,12.455786,11.578421,12.308223,12.595122,10.878968,11.981568,10.70099,10.89124,11.01667,11.742833,11.872474,11.806845,11.305973,11.684531,12.292683,12.283846,10.897031,11.306232,11.971781,11.451949,11.636614,11.755296,12.05799,12.000421,11.764286,11.555199,11.800116,11.038329,11.246888,12.333509,11.761406,10.986251,12.009483,11.902658,11.347323,11.501136,11.376332,11.674774,11.568639,11.875684,11.238995,11.805267,12.665212,11.857298,12.428982,11.36302,12.184341,11.342339,11.116454,12.003231,11.255937,12.426603,12.644543,11.78037,10.08306,12.188992,10.875803
"""-""","""hsa_circ_0001360""","""ENSG00000173889""","""protein_coding""","""PHC3""",11.090634,10.501365,11.30152,10.452818,10.50058,10.113605,10.350767,11.220984,11.264058,11.187659,10.858834,11.028521,10.563421,9.907719,11.107396,9.380362,10.351017,11.01667,10.855665,11.135765,10.338072,10.891126,10.752044,10.445433,10.753878,9.838957,10.620687,10.97214,10.426947,9.678564,11.205293,12.164881,9.048599,11.179531,11.292261,9.850185,7.956001,11.094951,11.277228,11.045466,10.779909,11.066389,11.902658,10.283797,10.179954,10.518792,11.113105,9.984626,8.420087,10.365024,10.80567,10.981206,10.761525,10.95814,10.041914,10.99711,10.857135,11.367889,9.763532,10.68918,10.924583,10.908103,10.574456,11.234331,11.163777,11.345075
"""-""","""hsa_circ_0070039""","""ENSG00000138750""","""protein_coding""","""NUP54""",10.44715,10.394526,9.775349,10.867598,11.052804,11.212446,10.994226,11.138558,10.094866,10.429095,10.47205,10.723829,11.103691,9.529658,11.277249,9.380362,10.476455,10.432056,10.341426,11.457564,11.280953,10.842242,10.292926,9.860988,10.831836,10.061124,10.954917,11.042495,10.263574,11.207927,9.569152,11.605602,11.216404,10.59488,9.971195,10.112959,10.273274,10.925108,10.803522,11.144956,11.327138,10.950968,0.0,10.631472,10.831579,10.104082,10.720992,8.986049,11.918742,10.239592,10.80567,9.759765,10.710929,10.89106,10.62642,10.827273,9.758488,10.046779,10.551329,10.182588,10.278639,10.908103,10.78078,10.720015,10.880121,9.983376


In [190]:
# piRNA table: two annotation columns (name and id) followed by per-sample
# values. The returned frame is shown as the cell output.
# NOTE(review): the thresholds are passed straight through to the project
# helper; their exact meaning is defined there -- confirm before tuning.
preprocess_seq_data(
    df=pirna,
    annotation_cols=["piRNA_name", "piRNA_id"],
    min_median_expression=5,
    min_detection_rate=0.2,
    top_n=500,
    output_path="mds_data/preprocessed/pirna.csv",
)

piRNA_name,piRNA_id,N58,N60,N70,N82,N83,N84,N85,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""hsa_piR_020485""","""DQ598159""",8.455423,9.37332,9.075342,9.285851,8.367354,7.807742,9.70487,10.288441,10.105362,8.400049,9.827009,7.327685,8.001009,7.098474,8.501759,7.849765,6.253277,10.224737,9.270382,7.68859,7.95295,10.422287,9.734947,7.500889,9.607077,5.609805,8.907565,9.776856,8.334247,8.37653,8.329637,11.337857,11.704795,16.539829,10.122036,8.689609,10.025685,11.541256,10.354855,10.436414,7.271395,10.588501,10.113679,6.759121,7.952574,8.953183,8.218316,9.438432,8.461454,8.430144,10.38994,11.265676,9.579826,9.055617,9.348097,8.292343,6.995855,10.333094,8.336028,6.861325,6.988991,7.906521,10.203794,8.135731,8.930686,12.191744
"""hsa_piR_013624""","""DQ588594""",5.569475,7.201653,6.196311,6.555731,6.093038,5.927076,8.263922,7.172626,5.457152,5.638433,9.043585,5.963472,5.465458,0.0,0.0,0.0,0.0,7.510618,5.544405,8.84001,5.377385,7.530208,8.240006,5.00257,7.115819,0.0,6.398203,6.849179,5.111586,8.931533,0.0,10.062046,0.0,13.611739,6.44929,5.998363,0.0,8.371306,10.446418,6.007315,0.0,6.827731,8.574596,0.0,5.9539,7.582307,5.424335,5.087113,0.0,5.865945,7.766487,9.290746,6.197799,5.960777,8.219485,5.696907,4.804422,7.76875,5.926628,6.251256,0.0,8.277123,7.552067,6.291152,6.595916,7.592273
"""hsa_piR_008488""","""DQ581533""",5.295862,7.322849,6.115006,5.508234,0.0,0.0,6.770001,6.468483,0.0,0.0,6.985606,0.0,0.0,0.0,5.678229,0.0,0.0,6.464665,5.900089,6.051422,7.320932,6.596264,7.113234,0.0,5.831382,0.0,5.890942,5.913219,5.759095,6.757547,0.0,9.612374,0.0,12.883552,0.0,0.0,6.097163,7.770899,8.730197,5.433523,5.752977,7.990583,8.084279,0.0,6.974377,5.973415,0.0,6.201278,0.0,0.0,6.869534,9.779287,0.0,5.818823,7.584742,7.329895,4.804422,6.514948,7.420174,6.731261,0.0,7.445822,7.25428,7.4104,5.547964,8.339261
"""hsa_piR_014629""","""DQ590023""",0.0,0.0,5.937206,7.69615,0.0,7.94147,6.191633,7.39359,6.88366,5.896632,7.167115,0.0,5.806367,5.689865,0.0,0.0,0.0,0.0,5.439866,6.241345,5.255006,5.553434,7.269697,0.0,8.132897,0.0,7.709832,6.49018,5.111586,5.538491,5.06921,6.932077,7.429411,12.252611,7.226621,6.575784,5.566142,7.020545,7.431398,0.0,6.589035,6.942231,7.176425,0.0,5.098697,7.694217,6.513139,0.0,0.0,0.0,6.259428,8.110448,5.499758,5.818823,0.0,6.828416,4.804422,6.441777,6.221734,6.047716,5.209754,0.0,0.0,0.0,6.165071,7.455517
"""hsa_piR_020008""","""DQ597482""",6.327367,7.283571,6.196311,8.734388,9.026262,7.698625,9.824402,7.754252,5.693098,6.4735,7.940534,0.0,0.0,5.161945,6.050225,4.194154,5.272066,7.841564,0.0,6.988442,0.0,7.232448,8.634789,0.0,10.344895,0.0,9.016054,6.102869,5.966962,8.598302,5.06921,10.196722,0.0,13.418974,0.0,6.439792,5.566142,6.928177,8.556136,6.007315,7.058764,6.157259,7.729623,5.735599,6.841194,10.556089,6.351535,0.0,5.54376,0.0,7.662647,9.3039,5.840833,5.048797,0.0,5.479143,8.39998,6.714522,6.968725,6.5883,0.0,8.739245,7.872457,8.208235,7.588439,10.665394
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""hsa_piR_004962""","""DQ576828""",5.934522,6.512562,6.115006,7.382973,6.645657,7.496144,8.035249,7.39359,6.930572,6.305603,7.297486,6.299634,6.63148,5.598618,5.93656,5.02074,5.586508,7.341682,7.207449,7.551786,6.629914,6.758164,8.108015,6.17854,7.399353,4.997799,7.680826,7.338592,5.328005,6.303106,6.550077,6.385039,7.119486,8.573875,7.080811,6.391455,5.566142,7.601799,7.751653,6.791344,7.659128,7.048308,8.171429,5.088342,5.746066,7.798068,6.91139,6.135098,6.847012,6.853523,7.386623,8.306181,7.687525,6.949149,7.584742,6.630986,6.306246,7.827379,8.065649,6.861325,6.129587,6.057255,8.134465,6.19927,6.715016,7.381949
"""hsa_piR_008113""","""DQ581032""",17.99194,18.059353,18.121401,17.939341,17.984465,18.060176,17.96826,18.010611,18.142645,18.048051,17.750217,18.138738,18.090363,18.284286,18.0287,18.303277,18.25408,17.971729,17.690115,17.968327,17.732295,17.532568,17.524727,18.190968,17.794484,18.574146,17.926159,17.947342,18.309354,17.935515,18.3436,17.937851,17.975105,17.349026,18.014754,18.242076,18.149494,18.379674,17.820362,18.153242,17.899376,17.971932,17.953461,18.248717,18.350903,17.930265,18.146999,18.103772,17.862973,18.250813,16.343952,17.124704,18.10534,18.099882,18.384834,16.613968,18.126076,17.897942,17.961335,17.78429,18.293969,17.835002,17.991383,18.095361,17.850615,17.73303
"""hsa_piR_004271""","""DQ575827""",6.674949,6.703231,6.664723,8.088152,7.61257,8.281271,8.107735,8.245356,7.803861,7.878354,8.441306,6.372718,7.112744,6.994926,7.291284,5.02074,6.129464,7.562803,7.207449,8.583466,6.770566,7.987509,8.831771,6.17854,7.636212,4.997799,8.041833,8.018037,6.408257,7.352468,6.343028,6.865526,8.011581,8.526174,8.100914,6.814463,6.341759,8.598081,8.493114,7.257207,8.109718,7.23959,7.785945,6.995253,6.615081,7.985719,6.949549,6.383348,6.597858,7.526909,8.292387,8.69367,7.408916,7.169872,7.290993,7.193288,7.084444,7.911084,8.294339,7.090811,5.322229,7.046381,7.872457,6.928361,7.588439,8.037737
"""hsa_piR_008112""","""DQ581031""",18.634616,18.917902,18.636449,18.512548,18.648633,18.640015,18.612728,18.038645,18.414233,18.376268,18.438451,18.770815,18.588144,18.585833,18.699543,18.657974,18.458772,18.373511,18.057418,18.673894,18.360574,17.038216,18.272995,18.518511,18.580864,18.353172,18.492592,18.703922,18.478029,17.982453,18.601737,18.781904,18.405321,17.331571,18.577736,18.442358,18.747658,17.186119,18.076761,18.655282,18.591131,18.472669,17.171638,18.323391,18.58923,18.537953,18.675436,18.550556,18.474313,18.453451,17.256454,17.852019,18.911393,18.705712,18.467025,17.381236,18.507624,17.825802,18.424034,18.594778,18.431243,18.451303,18.121203,18.593239,18.669182,17.568038


In [191]:
# Transposable-element counts: a single annotation column ("TE") followed by
# per-sample values. The returned frame is shown as the cell output.
# NOTE(review): the frame is passed positionally here, whereas the other
# preprocessing cells pass it as `df=`; consider the keyword form once the
# helper's signature is confirmed.
preprocess_seq_data(
    te_counts,
    annotation_cols=["TE"],
    min_median_expression=10,
    min_detection_rate=0.4,
    top_n=1000,
    output_path="mds_data/preprocessed/te_counts.csv",
)

TE,N58,N60,N70,N82,N83,N84,N85,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""HERV-K14CI""",2.583999,3.92374,3.382264,4.466435,3.173178,2.70803,3.606353,2.087902,4.305403,3.768617,4.637972,4.723148,5.612011,4.215552,2.846294,2.784579,2.934658,2.56158,2.282875,2.960295,2.346483,1.670561,3.085617,4.489318,4.755815,2.959792,3.792796,4.568955,2.711949,2.411238,2.132183,2.252645,6.435916,7.356509,3.751317,3.783485,3.340499,5.319501,2.891586,1.834446,6.745917,4.347916,8.13262,3.837695,2.341652,3.339166,3.625275,4.479783,3.781263,3.688004,4.151038,4.364928,2.277848,4.151716,1.82812,6.60283,3.943479,4.597205,4.062437,5.606828,5.099781,2.834413,3.477563,4.013689,3.594034,3.264587
"""MER67D""",1.748082,1.880444,2.522531,1.764921,2.061352,0.813822,2.117061,0.805821,2.089398,1.840171,1.939537,2.911081,2.644204,2.826342,2.564698,2.096923,1.267615,1.57136,0.888159,0.835255,1.688803,0.435561,1.217565,1.584234,2.295036,3.323877,1.736522,3.090665,1.691453,0.779482,1.841789,1.769975,3.514084,4.622103,1.200842,2.784794,1.243883,1.920057,2.534839,0.891767,3.506729,1.705223,4.562983,2.663372,0.275017,1.325797,2.192644,3.219695,0.588123,3.260879,1.416638,2.832916,2.187812,0.798008,1.497516,3.541159,1.405592,1.203166,1.322824,1.636374,2.06795,1.594856,1.655074,1.690529,1.993703,1.292548
"""MIR3""",3.449018,2.888113,2.075519,1.863307,1.538173,2.234776,2.257967,4.683059,2.495399,2.7927,2.892948,2.722088,3.118432,1.969459,2.564698,4.46281,2.894311,3.229642,2.13857,3.814503,3.817239,3.882158,4.869146,4.623341,2.820827,2.645675,3.880172,2.402798,3.242364,5.34296,0.797773,4.376147,2.460371,3.425921,2.491674,2.861252,1.438527,1.473804,1.658922,1.360826,3.698297,4.072581,3.855228,2.971474,1.422269,4.637337,2.043154,2.653504,2.467177,1.202173,2.212712,3.305833,5.069399,2.416304,2.351349,3.823512,3.045597,2.761041,3.407344,3.085565,1.327998,2.715116,3.341263,4.115769,2.991377,4.103188
"""HERVK11DI""",4.59405,6.198976,4.481862,4.527752,4.886738,4.08455,4.693326,3.803064,5.790712,5.363506,5.254235,5.360619,5.454212,4.577977,4.856824,4.205859,4.761917,4.230564,4.930048,4.570119,4.367068,4.464505,4.017202,5.26344,5.1554,5.314267,4.31179,5.333378,3.955073,4.065493,4.071671,4.00326,6.429068,7.454761,4.316865,4.322621,4.074065,5.763443,4.839176,3.319358,6.761271,4.174753,7.726508,5.196095,5.739398,3.945341,5.197311,4.935023,5.581674,5.398718,5.924065,4.882003,4.801781,4.147704,4.020499,7.005446,4.591472,6.376757,6.526739,6.700156,4.826894,4.60955,5.046256,4.597786,4.522715,4.731232
"""MER41G""",2.708681,3.202239,1.786384,2.650033,2.996805,2.4458,2.613156,1.697961,2.563577,2.133611,2.313291,2.525701,3.86188,2.09786,2.30248,0.874778,1.853666,0.798625,3.324612,1.745245,1.795347,1.555033,2.257064,2.505926,3.034598,1.586406,2.017903,2.711393,1.675466,1.882419,1.39205,0.785043,3.778998,4.511182,2.217468,2.047269,1.610004,2.621722,2.273843,2.36027,3.0657,2.649657,4.932848,1.961725,1.978238,2.20767,2.114218,2.533051,2.255401,2.368139,2.452703,2.743754,1.993637,1.549919,1.497516,3.745925,1.492736,2.891603,3.08878,3.567886,2.593385,0.715704,2.788859,1.797129,2.235086,1.796145
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""MIR""",6.677831,6.788537,6.314765,6.958926,7.034628,6.721206,6.721739,7.028812,7.244991,7.537448,6.862972,7.235509,7.796649,6.641582,7.090833,6.595181,6.746419,7.578678,6.046585,7.090728,7.086273,6.57355,7.086004,7.431315,6.069955,7.168895,6.247861,6.925978,6.459443,6.534874,6.213921,6.291868,6.700427,7.038677,6.610074,6.964897,6.521484,6.720646,6.76907,5.990081,6.223178,6.802384,6.942107,7.343229,6.510793,6.678142,7.39155,7.171696,6.734482,7.414221,6.272744,7.590237,7.215635,6.922687,5.791339,6.707112,6.18416,7.268394,7.051051,7.160186,6.761338,6.897273,7.337293,6.765664,6.375749,6.65867
"""MER21B""",6.736581,7.147338,6.50934,7.156005,7.439173,7.228509,7.170062,7.158072,7.490544,7.368395,7.01046,7.509508,7.808913,6.91986,7.464746,6.485703,6.675633,7.042765,6.981128,7.071444,6.786404,6.865878,7.105503,7.430674,6.562894,7.336557,6.590473,7.252773,6.712506,6.277688,6.389747,6.555184,6.926358,7.481182,6.919657,7.197915,6.666591,7.114501,7.128351,6.533548,6.710208,6.977285,7.246054,7.424998,6.924656,6.602746,7.544351,7.280468,6.784221,7.679857,6.73463,7.041375,7.129449,6.946477,6.178493,7.04034,6.259128,7.457441,7.18098,7.380183,6.891558,6.997779,7.347763,6.729518,6.794623,6.681524
"""L1M2A_5""",5.711999,6.012728,5.133204,5.902034,5.970653,5.561405,5.52811,5.832065,5.954704,5.953247,5.785469,5.970032,6.814397,5.290574,5.878814,5.632601,5.673087,5.794509,5.274935,5.87879,5.684987,5.570418,6.10774,6.190077,5.079171,5.81799,5.480033,5.766067,5.523319,5.403424,5.39136,5.483667,5.602138,6.027548,5.460362,6.117913,5.246597,5.603302,5.760589,4.523614,5.776451,5.374859,5.630191,6.032637,5.963365,5.266419,6.362627,5.971013,6.02766,6.231501,5.294802,6.203609,6.045706,5.797731,4.93268,5.741278,5.06797,6.071147,6.198235,6.21576,6.053348,6.058959,6.041586,5.673388,5.238056,5.605395
"""L1MD2""",8.267135,8.282533,7.795038,8.296698,8.708906,8.411793,8.279179,8.388446,8.722487,8.760187,8.212911,8.766914,8.983398,8.135709,8.799595,8.073301,8.199823,8.471868,7.819153,8.507468,8.292111,8.110704,8.378757,8.964693,7.48027,8.685181,7.8803,8.563234,7.983076,8.041513,7.752933,7.998287,7.996855,8.451578,8.078586,8.493111,7.956688,8.081241,8.560959,7.489322,8.082708,8.077267,8.442015,8.72514,8.382051,7.843385,8.729707,8.787079,8.17613,9.05596,7.867979,8.700227,8.623746,8.266793,6.744585,8.563574,7.654384,8.705004,8.56488,8.78545,8.45283,8.366543,8.633869,8.265726,7.96139,8.351327


In [192]:
# Reload the sample annotations from the preprocessed CSV and display them.
# This rebinds `annotations`, which an earlier cell built from the Excel
# sample sheet.
# NOTE(review): the `write_csv` call that produces this file is commented out
# in that earlier cell, so this read relies on a file left by a previous run.
annotations = pl.read_csv(
    "mds_data/preprocessed/annotations.csv",
)
annotations

SAMPLE_NAME,SAMPLE_ID,GROUP,disease,risk,mutations
str,str,str,i64,i64,i64
"""N54_S14""","""N54""","""CTR""",1,0,0
"""N58_S18""","""N58""","""CTR""",1,0,0
"""N60_S15""","""N60""","""CTR""",1,0,0
"""N70_S16""","""N70""","""CTR""",1,0,0
"""N82_S1""","""N82""","""CTR""",1,0,0
…,…,…,…,…,…
"""V839_S17""","""V839""","""EPI""",2,2,0
"""V883_S4""","""V883""","""SPL/EPI""",2,1,0
"""V888_S2""","""V888""","""SPL""",2,1,2
"""V940_S8""","""V940""","""SPL/EPI""",2,1,0


In [193]:
# Restrict the annotations to the samples listed in `common_names` and order
# them by sample id.
# NOTE(review): `common_names` is defined in an earlier cell -- presumably the
# sample ids shared by all expression tables; confirm.
annot_66 = (
    annotations
    .filter(pl.col("SAMPLE_ID").is_in(common_names))
    .sort("SAMPLE_ID")
)

# Write in a separate statement: `write_csv` returns None when given a path,
# so chaining it onto the assignment would leave `annot_66` bound to None
# instead of the filtered frame.
annot_66.write_csv("mds_data/preprocessed/annotations_66.csv")

In [194]:
# Preview the mRNA table: first ten columns, default number of leading rows.
mrna.select(mrna.columns[:10]).head()


GENE_ID,GENE_NAME,N58,N60,N70,N82,N83,N84,N85,V1048
str,str,i64,i64,i64,i64,i64,i64,i64,i64
"""ENSG00000188026""","""RILPL1""",235,244,266,89,232,198,246,404
"""ENSG00000167578""","""RAB4B""",951,856,905,749,865,1441,682,764
"""ENSG00000078237""","""TIGAR""",184,691,258,393,356,288,482,404
"""ENSG00000158486""","""DNAH3""",0,16,8,30,4,10,0,0
"""ENSG00000283967""","""RP11-432M8.3""",1,0,0,2,2,1,2,0
