In [1]:
%load_ext autoreload
%autoreload 2

import polars as pl

from src.seqeuencing_data_preprocessing import preprocess_seq_data

# load and transform annotations

In [2]:
# load sample_sheet
sample_sheet = pl.read_excel("mds_data/raw/sample sheet for CVUT.xlsx")

annotations = (
    sample_sheet.with_columns(
        pl.col("SAMPLE_NAME")
        .str.split("_")
        .map_elements(lambda x: x[0], return_dtype=pl.String)
        .alias("SAMPLE_ID")
    )
    .sort("SAMPLE_ID")
    .drop("RUN", "PLATFORM", "N_UNIQ_MAP_READS", "UNIQ_MAP")
    .rename(
        {
            "1 disease": "disease",
            "2 risk": "risk",
            "3 mutations (SF3B1only_wt)": "mutations",
        }
    )
    .select("SAMPLE_NAME", "SAMPLE_ID", "GROUP", "disease", "risk", "mutations")
)

# annotations.write_csv("mds_data/preprocessed/annotations.csv")

# load mRNA

In [173]:
mrna = pl.read_csv(
    "mds_data/raw/200625_allRNA_fromRNAseq_annot_hg38.tsv", separator="\t"
)
with pl.Config(tbl_cols=-1, tbl_rows=-1):
    print(mrna["GENE_TYPE"].value_counts(sort=True))

shape: (46, 2)
┌─────────────────────────────────┬───────┐
│ GENE_TYPE                       ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ protein_coding                  ┆ 19873 │
│ processed_pseudogene            ┆ 10209 │
│ lincRNA                         ┆ 7430  │
│ antisense                       ┆ 5479  │
│ unprocessed_pseudogene          ┆ 2645  │
│ misc_RNA                        ┆ 2212  │
│ snRNA                           ┆ 1900  │
│ miRNA                           ┆ 1879  │
│ TEC                             ┆ 1065  │
│ snoRNA                          ┆ 943   │
│ sense_intronic                  ┆ 893   │
│ transcribed_unprocessed_pseudo… ┆ 853   │
│ processed_transcript            ┆ 550   │
│ rRNA                            ┆ 542   │
│ transcribed_processed_pseudoge… ┆ 471   │
│ IG_V_pseudogene                 ┆ 188   │
│ sense_overlapping               ┆ 179   │
│ IG_V_gene      

In [8]:
# mrna
mrna = pl.read_csv(
    "mds_data/raw/200625_allRNA_fromRNAseq_annot_hg38.tsv", separator="\t"
)

mrna = mrna.filter(pl.col("GENE_TYPE") == "protein_coding")

mrna = mrna.drop(["CHR", "START", "END", "GENE_TYPE"])
gene_ids = mrna["GENE_ID"]

rename_dict = {col: col.split("_")[0] for col in mrna.columns[2:]}
mrna = mrna.rename(rename_dict)

mrna_names = mrna.columns[2:]
mrna

GENE_ID,GENE_NAME,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V1249,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1426,V1441,V1456,V148,V1505,V1528,V1554,V1565,V1577,V1591,V1592,…,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V681,V708,V712,V714,V716,V777,V788,V795,V806,V833,V839,V853,V883,V888,V940,V956,V957
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""ENSG00000188026""","""RILPL1""",407,235,244,266,89,232,198,246,134,279,131,133,114,404,60,207,196,172,146,256,393,195,110,106,463,128,239,203,223,153,65,384,146,226,210,…,200,120,362,315,93,170,376,120,715,180,314,263,128,426,352,164,538,44,131,432,262,98,65,92,355,167,283,122,458,439,139,260,273,474,364,176,69
"""ENSG00000167578""","""RAB4B""",951,951,856,905,749,865,1441,682,851,681,849,555,454,764,1080,812,1214,710,1111,613,1526,1348,1390,765,1665,527,1259,1026,1317,1183,412,1682,1278,1177,753,…,914,1798,937,812,304,636,1165,1137,1514,1024,920,989,948,2094,1350,1989,1261,460,685,1374,778,831,786,1370,1304,882,3043,1605,801,1118,907,1815,887,1130,813,0,1242
"""ENSG00000078237""","""TIGAR""",580,184,691,258,393,356,288,482,275,277,325,240,329,404,301,240,777,296,404,173,343,504,878,133,374,686,606,485,322,309,151,508,731,488,568,…,332,511,424,218,142,304,511,590,475,564,366,400,219,998,376,817,609,80,342,408,471,336,217,630,1098,243,746,361,499,486,1073,317,328,362,227,242,522
"""ENSG00000158486""","""DNAH3""",18,0,16,8,30,4,10,0,18,9,3,3,10,0,3,0,27,29,0,12,97,0,4,2,31,6,0,195,8,0,8,11,7,3,0,…,0,12,92,17,266,0,7,8,4,3,3,0,2,20,43,0,12,0,134,6,0,4,9,80,23,10,227,0,3,47,109,0,0,19,2,0,17
"""ENSG00000283967""","""RP11-432M8.3""",0,1,0,0,2,2,1,2,1,1,0,0,1,0,1,0,2,3,0,3,2,0,0,0,0,1,0,0,1,2,3,0,2,1,0,…,0,0,7,2,8,0,1,0,6,1,0,0,0,2,0,0,1,0,7,1,0,0,1,0,0,0,1,0,0,2,1,0,4,2,1,1,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000186115""","""CYP4F2""",152,182,25,144,178,120,105,76,257,203,114,26,57,0,129,487,449,81,381,43,139,32,0,324,60,621,138,255,115,30,5,281,910,50,1116,…,116,22,14,474,58,59,119,248,38,8,11,127,177,8,96,458,157,191,58,213,308,752,481,237,138,7,468,545,11,143,226,70,152,84,171,0,278
"""ENSG00000009694""","""TENM1""",22,32,11,42,20,32,26,25,75,33,19,30,3,11,173,5,54,31,25,46,93,2876,28,26,40,48,7,90,23,17,52,49,55,29,7,…,23,31,78,126,108,138,15,232,231,107,38,7,18,203,39,4,16,6,777,11,59,46,18,54,58,35,97,36,19,44,57,30,23,36,32,0,412
"""ENSG00000123685""","""BATF3""",127,67,134,147,23,216,82,90,51,58,64,17,72,83,86,178,147,124,76,70,50,115,34,100,185,166,128,131,150,73,49,227,239,103,48,…,33,104,109,92,98,315,119,37,223,60,52,74,67,184,182,140,351,25,76,210,185,90,181,168,343,262,165,315,472,215,106,366,137,131,84,92,53
"""ENSG00000105063""","""PPP6R1""",22845,28193,26489,21006,17065,22682,26769,18729,20738,22708,18446,18690,18843,24906,15464,18758,21505,28065,21519,26893,24247,22835,31129,19041,93501,24492,25224,19994,24718,21241,8544,43097,30355,28033,17662,…,17448,34875,15198,22322,4353,14988,20336,32244,31834,21384,15798,18774,17235,29815,30757,27047,23945,7123,14442,23880,29027,27304,13041,20407,23342,15997,46899,40751,27759,24271,16824,33950,23426,30607,25504,0,25170


- accounted for sequencing depth between different samples -> gene counts comparable between different samples now
- computed scale invariant variance for each feature

In [5]:
# mirna genes
mirna_genes = pl.read_csv(
    "mds_data/raw/200625_allRNA_fromRNAseq_annot_hg38.tsv", separator="\t"
)

mirna_genes = mirna_genes.filter(pl.col("GENE_TYPE") == "miRNA")

mirna_genes = mirna_genes.drop(["CHR", "START", "END", "GENE_TYPE"])
gene_ids = mirna_genes["GENE_ID"]

rename_dict = {col: col.split("_")[0] for col in mirna_genes.columns[2:]}
mirna_genes = mirna_genes.rename(rename_dict)

mirna_genes_names = mirna_genes.columns[2:]
mirna_genes

GENE_ID,GENE_NAME,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V1249,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1426,V1441,V1456,V148,V1505,V1528,V1554,V1565,V1577,V1591,V1592,…,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V681,V708,V712,V714,V716,V777,V788,V795,V806,V833,V839,V853,V883,V888,V940,V956,V957
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""ENSG00000263642""","""MIR4802""",3,1,1,0,3,1,3,3,2,0,6,4,3,2,5,0,3,6,4,3,4,3,4,2,3,4,9,1,0,2,0,4,0,1,0,…,1,0,0,0,7,1,4,0,3,4,2,1,3,22,4,13,11,0,2,1,0,0,0,2,0,0,4,0,3,2,5,1,2,3,3,3,0
"""ENSG00000283842""","""MIR4751""",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,…,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1
"""ENSG00000199047""","""MIR378A""",9,3,3,1,4,1,3,2,0,9,4,3,1,2,7,7,8,7,10,2,5,5,5,5,4,5,4,4,2,0,6,7,7,0,11,…,4,4,1,7,4,3,4,1,8,8,4,1,1,12,3,10,5,6,4,6,4,9,12,7,0,5,6,16,11,4,1,5,2,4,1,2,4
"""ENSG00000207559""","""MIR578""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,4,0
"""ENSG00000207698""","""MIR32""",12,10,16,10,11,13,14,5,6,8,14,4,10,8,15,7,17,16,12,14,6,11,10,24,5,1,9,6,11,16,5,10,5,10,10,…,11,2,4,6,0,5,11,11,8,16,12,11,13,22,4,4,9,3,5,7,11,3,12,12,9,10,7,15,5,6,28,9,8,13,13,17,14
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000263414""","""MIR3187""",7,28,33,10,9,9,24,28,20,37,15,22,26,9,9,14,18,48,29,13,9,11,4,9,77,47,38,22,11,8,6,9,41,16,11,…,28,46,12,22,9,24,17,61,23,34,4,8,14,15,6,7,20,6,32,26,113,27,23,10,21,0,31,68,7,18,19,1,7,45,19,0,33
"""ENSG00000265064""","""MIR4692""",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,2,1,2,0,0,0,1,1,0,0,0,2,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,7,0,0,3,0,0,1
"""ENSG00000263573""","""MIR4270""",2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,2,0,1,0,0,0,0,1,0,1,0,0,…,0,0,1,0,1,2,0,0,2,2,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,3,0,2,1,0,0,4,2
"""ENSG00000221760""","""MIR548J""",11,10,24,15,2,5,4,0,15,3,3,6,3,5,14,9,20,9,4,13,15,11,8,15,10,9,7,12,0,6,9,3,9,20,22,…,6,5,7,6,2,4,12,12,10,26,17,11,12,22,56,7,9,3,6,3,8,6,16,2,8,9,7,7,11,16,12,14,17,4,12,0,16


# load miRNA

In [6]:
# load mirna
mirna = pl.read_excel("mds_data/raw/final_all_samples_miRNA_seq.xlsx")

mirna_rename_dict = {col: col.split("_")[0] for col in mirna.columns[1:]}
mirna = mirna.rename(mirna_rename_dict)
mirna_names = mirna.columns[1:]

mirna

miRNA,V1565,N58,V1874,V777,N80,V1788,N65,V2368,N81,N59,V2286,V406,V100,N82,V2133,V574,V2115,V1921,V714,V637,V1742,V1744,V2248,V1428,V18,V1857,V839,V912,V1048,V911,V940,V681,V708,N60,N70,V148,…,V1441,V1699,V1297,V1321,V1505,V1249,V1456,V1426,V1394,V1592,V1528,V1591,V833,V1708,V1800,V1776,V1823,V1775,V1834,V2378,V2414,V1860,V1884,V1920,V2322,V2311,V2291,V1957,V2092,V2284,V2278,V2110,V2179,V2147,V2224,V2089,V788
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""hsa-let-7a-2-3p""",23,0,11,12,11,0,13,12,12,36,15,0,35,0,17,10,12,10,13,0,0,18,58,11,12,60,0,17,26,13,18,0,13,0,0,0,…,0,15,10,0,27,0,0,0,0,10,0,18,0,0,10,19,79,12,0,17,24,17,0,12,33,0,13,0,10,12,17,47,0,13,26,20,12
"""hsa-let-7a-3p""",1224,1619,3376,1972,1811,1490,589,1721,2311,1272,1211,1291,280,1455,197,1503,1785,1379,1150,877,1239,5322,1659,1081,4406,431,1859,2536,2123,2414,1062,1164,1876,642,663,1079,…,1970,1087,1096,1382,1564,2106,1657,542,1472,1084,1287,1164,1887,1440,1033,1326,879,2074,1043,3247,1356,1351,1133,242,3349,1745,1555,1015,1132,1667,3147,1072,797,1027,4693,1156,2476
"""hsa-let-7a-5p""",608369,933004,1069405,850554,594247,1153938,272068,927860,608709,612056,445097,781218,122906,565251,118793,305157,1480722,512584,513964,414213,647469,1169076,569087,430376,821909,424407,145849,198109,250684,127034,181953,412468,315491,546646,555368,669588,…,803740,600765,772108,873003,522350,980458,359406,173762,1578499,967461,538431,420551,641671,1141110,1320356,598512,423203,827435,475608,1073692,604248,491170,735693,321751,654808,587831,722210,522961,451919,622551,737114,464613,481072,606788,716735,575703,175859
"""hsa-let-7b-3p""",1001,932,1023,1177,614,1251,285,1086,615,825,617,880,185,796,280,428,1437,571,588,477,682,2064,710,653,1556,460,639,1164,811,964,528,947,695,578,799,1046,…,1406,540,1149,1080,955,1515,567,483,1356,867,894,504,1018,1922,960,678,434,1138,726,1606,857,800,934,402,1031,972,980,648,592,1132,1185,842,766,959,1045,1071,1190
"""hsa-let-7b-5p""",342092,304774,294765,290467,176982,430718,85112,340512,176784,226291,145897,319534,43416,204520,37736,85009,435604,149807,181362,153956,268507,409761,169404,160704,272913,189890,67291,91100,104068,41298,72557,128479,137384,163905,191757,281571,…,324863,195570,401014,305985,171342,412420,125376,125395,653889,414149,191121,140175,196653,639686,607506,170347,151499,285312,179951,392510,187292,148536,284648,23841,199909,210815,244025,61806,150372,254426,242105,175841,174812,279244,212584,214623,47734
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""hsa-miR-98-5p""",3623,6190,11127,5755,7987,7638,3615,4912,8379,6262,5413,5450,1803,3653,1182,8473,10794,6428,3961,4019,4443,18940,5849,3854,16665,2817,1123,979,1967,1027,1221,2805,1337,3109,3969,3602,…,3447,5269,3617,3639,8158,8577,3443,812,8640,3890,6773,6405,4029,3371,6547,4734,5314,6176,3302,6752,7932,8478,4598,611,7403,5955,10223,1818,5579,7627,6885,5565,5088,4885,7689,4639,1691
"""hsa-miR-99a-3p""",145,578,2236,1246,721,1013,403,1020,834,636,206,571,88,891,124,309,1206,545,472,163,1245,1755,318,390,1226,325,286,445,514,429,572,295,541,305,961,440,…,781,337,754,1960,399,591,522,86,1033,302,200,439,1310,546,683,377,132,1758,1193,846,396,439,255,455,860,322,232,1239,392,643,765,260,205,689,1085,243,428
"""hsa-miR-99a-5p""",2325,5912,46227,14205,10895,7921,13504,8229,18405,20334,1753,5294,690,4773,3681,6721,25169,12275,2998,9710,5980,14416,4179,2720,13514,3690,2604,1975,2367,3836,2411,2446,2441,2907,9476,4754,…,7129,3175,12337,15035,2854,9555,2547,1000,9952,2709,1768,2669,19219,5157,5804,3801,1972,13403,13523,8872,7846,6208,1859,3834,5936,3779,2658,12357,3347,9017,8966,3626,2045,5751,10492,1472,2934
"""hsa-miR-99b-3p""",346,489,462,1045,264,385,156,388,267,314,118,377,80,330,79,298,1007,144,305,65,152,568,214,154,261,204,168,193,200,146,276,215,310,199,332,412,…,377,143,554,580,272,342,126,258,368,453,276,279,1007,266,548,179,96,618,287,778,356,155,83,600,358,312,696,1094,168,442,277,171,129,288,348,163,169


In [10]:
len(set(mrna_names).intersection(set(mirna_genes_names)))

86

# load circRNA

In [11]:
circrna = pl.read_csv(
    "mds_data/raw/200625_circRNA_fromRNAseq_annot_hg19.tsv",
    separator="\t",
    null_values=["NA"],
)
# replace all null values for genes with 0
circrna = circrna.with_columns(
    pl.all()
    .exclude(
        "CHR",
        "START",
        "END",
        "TYPE",
        "STRAND",
        "circRNA_ID",
        "GENE_ID",
        "GENE_TYPE",
        "GENE_NAME",
    )
    .fill_null(strategy="zero")
)
# basic filtering
circrna = circrna.filter(
    (pl.col("GENE_TYPE").is_in(["protein_coding", "lincRNA", "antisense"]))
    & (pl.col("TYPE") == "exon")
).drop("CHR", "START", "END", "TYPE")

circrna_rename_dict = {col: col.split("_")[0] for col in circrna.columns[5:]}
circrna = circrna.rename(circrna_rename_dict)
# circrna[:, :9]
circrna_names = circrna.columns[5:]
circrna

STRAND,circRNA_ID,GENE_ID,GENE_TYPE,GENE_NAME,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V1249,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1426,V1441,V1456,V148,V1505,V1528,V1554,V1565,…,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V681,V708,V712,V714,V716,V777,V788,V795,V806,V833,V839,V853,V883,V888,V940,V956,V957
str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""-""","""hsa_circ_0009205""","""ENSG00000188976""","""protein_coding""","""NOC2L""",2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,4,0,0,0,0,0
"""-""","""hsa_circ_0000002""","""ENSG00000078808""","""protein_coding""","""SDF4""",17,23,25,23,22,23,32,27,35,26,23,13,8,13,10,18,23,31,16,15,49,31,104,16,34,23,27,32,25,35,32,28,…,11,68,12,8,0,0,8,38,34,19,19,17,17,36,11,53,23,5,33,41,34,28,10,25,21,33,49,53,29,11,5,123,33,33,18,29,15
"""-""",,"""ENSG00000107404""","""protein_coding""","""DVL1""",2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""-""",,"""ENSG00000160075""","""protein_coding""","""SSU72""",3,0,0,0,0,2,0,2,0,3,0,4,0,3,0,2,0,3,0,4,4,0,0,0,10,0,0,0,0,0,0,4,…,3,0,0,0,0,0,0,3,0,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""-""","""hsa_circ_0000007""","""ENSG00000078369""","""protein_coding""","""GNB1""",6,6,0,2,3,4,8,7,4,4,5,3,4,5,6,0,14,3,4,6,9,4,4,2,7,4,0,6,0,2,7,0,…,5,24,0,0,0,5,3,9,2,3,2,5,3,7,4,4,0,3,4,0,4,2,0,5,7,5,0,17,6,0,8,2,0,8,3,2,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""+""",,"""ENSG00000182150""","""protein_coding""","""ERCC6L2""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
"""-""",,"""ENSG00000136861""","""protein_coding""","""CDK5RAP2""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
"""+""",,"""ENSG00000107164""","""protein_coding""","""FUBP3""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
"""-""",,"""ENSG00000160271""","""protein_coding""","""RALGDS""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2


# load te counts

In [12]:
te_counts = pl.read_csv("mds_data/raw/TE_counts.csv")
te_counts_rename_dict = {col: col.split("_")[0] for col in te_counts.columns[1:]}
te_counts = te_counts.rename(te_counts_rename_dict)
te_counts_names = te_counts.columns[1:]
te_counts

TE,V1744,V2286,V1776,V220,V108,V2089,V221,V806,V1920,N60,V777,V1048,V1394,V1554,V344,V2110,V630,V2311,V1592,V480,V1884,V1337,NV911,V1249,V1565,V712,V714,N85,V2241,V1708,V18,V833,V1800,V1664,V1426,V839,…,V1591,V716,V655,V2092,V883,V67,NV1428,V1834,V382,N87,V125,V2368,V1775,N54,V2147,V681,V957,V1297,V940,N58,V2414,V637,V2284,V1689,V2248,V1823,V553,V148,V1505,V2322,V624,V2179,V1456,V1742,V1577,V456,V689
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ALU""",5887.91,5431.56,6959.7,8833.7,8986.45,10564.3,7065.91,7599.44,9787.8,9306.07,6061.99,6385.99,5579.7,4948.74,8698.03,5222.05,6877.89,8274.9,7800.57,5598.86,13599.0,7430.7,6199.62,12059.9,7059.21,6516.82,6303.3,7290.05,6080.92,9530.9,6264.46,10614.5,8582.4,5114.27,10942.3,7881.51,…,6148.82,11432.6,7078.72,4629.16,6081.12,8766.84,8097.26,9892.05,6815.32,7966.3,8590.44,6934.2,8178.09,8622.08,4299.38,9650.8,8026.74,7721.05,6593.15,7019.28,9786.57,7523.42,7382.74,6771.68,6835.69,4946.98,13410.6,7903.22,6618.57,4923.12,6690.64,4851.81,5909.45,9511.53,8769.84,8183.79,5019.56
"""AluJb""",21104.5,19944.4,23285.4,30996.4,27678.3,25346.5,26280.0,28709.9,38261.4,35332.8,22834.9,22606.9,19816.4,17392.3,26701.3,22804.4,24896.5,33242.2,28009.3,20592.4,44735.8,24996.5,21670.0,44371.0,27218.6,23824.4,21455.6,27695.8,23690.8,29308.3,21639.5,40095.7,33905.1,20905.2,35242.6,27292.4,…,24160.7,40157.1,26045.3,18683.8,22348.0,22323.5,25259.1,40239.7,27603.6,27669.4,34593.1,27649.2,20355.4,26817.6,16289.2,35666.1,27516.2,24279.6,23699.4,26406.7,23972.5,21814.8,27724.2,23908.0,29645.6,17890.9,59175.4,32247.8,21067.6,17729.7,24140.9,18699.9,21118.1,35568.6,36400.8,28090.8,17515.2
"""AluJo""",3241.23,3076.36,2831.86,4266.9,2273.46,101.421,3609.03,4184.29,5137.53,4933.68,2941.52,3670.61,2744.18,2217.45,3709.33,3165.13,2666.19,3452.7,3451.88,3147.54,4257.09,3504.48,3348.82,5805.75,3744.24,3006.96,2624.93,3677.09,3107.17,2902.06,2626.65,5094.82,4512.76,2585.24,4192.17,3839.13,…,3312.06,3849.75,3315.78,2532.95,3394.73,74.3837,2740.94,4179.26,3611.5,3584.89,4826.98,3597.06,59.6774,2462.54,2374.59,4874.16,3463.95,2681.86,3666.81,4019.45,33.1035,8.64467,4039.87,3335.72,3797.78,2786.46,7165.52,4382.27,2930.32,2043.76,3233.89,2758.92,2426.03,5215.14,4723.25,3125.83,2670.85
"""AluJr""",863.084,1021.31,943.936,1472.02,1141.14,936.128,1158.42,1238.01,1694.92,1550.93,1003.92,1048.49,688.697,927.231,1286.5,930.15,1063.58,1443.85,1135.73,774.718,1806.28,1002.38,935.308,1769.08,1180.33,1130.95,886.429,1153.25,863.661,1213.18,842.567,1812.97,1278.44,641.212,1534.48,1087.77,…,939.429,1840.33,1299.6,881.944,1015.1,964.59,1032.61,1870.29,1263.94,1316.59,1348.67,1158.08,833.327,1099.74,804.413,1579.61,1213.87,1046.95,1163.09,1220.17,918.734,1019.45,1235.09,1146.1,1277.6,736.021,2560.71,1347.83,802.787,724.04,1000.22,803.441,857.841,1759.8,1563.59,1174.99,554.36
"""AluJr4""",6219.15,5283.81,5759.8,7817.57,6722.98,7226.66,6557.64,7286.93,9023.09,8444.05,5932.29,6158.79,5702.57,4410.33,7028.46,5670.5,6414.47,7431.23,7221.28,5169.38,10841.0,6421.22,5709.1,10130.3,7417.67,5925.11,5720.99,7137.76,5902.87,7221.67,6530.15,9500.76,8615.35,5656.09,9291.27,6929.24,…,6496.37,10055.5,6111.24,4784.68,5684.14,5960.01,6608.18,9185.61,6911.11,7409.45,8564.76,6298.29,5168.7,6563.21,4846.74,8963.76,7385.98,5862.15,6350.26,6707.35,6238.01,5240.83,6716.72,6094.02,6534.11,5106.92,13894.9,7512.87,6093.45,4434.0,6364.48,5424.9,5299.94,8767.59,9004.87,6738.61,4403.27
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""X5A_LINE""",8.0,12.0,5.0,14.0,21.0,5.0,13.0,8.0,11.0,9.0,8.0,8.0,9.0,7.0,12.0,1.0,4.0,5.0,24.0,10.0,3.0,11.0,1.0,14.0,2.0,5.0,3.0,0.0,14.0,12.0,6.0,9.0,7.0,8.0,1.0,16.0,…,11.0,14.0,4.0,6.0,8.0,3.0,10.0,14.0,12.0,2.0,8.0,3.0,15.0,9.0,11.0,5.0,5.0,4.0,14.0,4.0,16.0,1.0,11.0,8.0,7.0,10.0,11.0,7.0,5.0,5.0,4.0,7.0,7.0,6.0,6.0,16.0,6.0
"""X5B_LINE""",3.0,5.0,8.0,5.0,7.0,7.0,7.0,14.0,1.0,13.0,0.0,4.0,2.0,0.0,3.0,10.0,8.0,2.0,5.0,12.0,5.0,3.0,3.0,1.0,8.0,5.0,15.0,11.0,12.0,3.0,12.0,8.0,4.0,16.0,6.0,10.0,…,5.0,19.0,6.0,3.0,15.0,4.0,6.0,0.0,14.0,8.0,8.0,1.0,1.0,10.0,4.0,9.0,6.0,6.0,10.0,8.0,9.0,0.0,11.0,6.0,8.0,1.0,12.0,22.0,1.0,3.0,6.0,2.0,13.0,10.0,4.0,3.0,1.0
"""X6A_LINE""",0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"""X6B_LINE""",1.0,5.0,3.0,1.0,7.0,5.0,3.0,5.0,0.0,6.0,4.0,5.0,2.0,0.0,3.0,1.0,1.0,4.0,3.0,3.0,0.0,1.0,0.0,0.0,2.0,4.0,8.0,2.0,4.0,1.0,4.0,4.0,5.0,2.0,3.0,4.0,…,1.0,13.0,12.0,11.0,1.0,0.0,2.0,11.0,6.0,5.0,4.0,6.0,6.0,6.0,2.0,4.0,7.0,2.0,1.0,0.0,4.0,0.0,3.0,1.0,3.0,0.0,3.0,4.0,2.0,3.0,5.0,1.0,0.0,2.0,3.0,4.0,5.0


# load pirna (skipped for now)

In [13]:
pirna = pl.read_excel("mds_data/raw/piRNA_counts.xlsx")
pirna_rename_dict = {col: col.split("_")[0] for col in pirna.columns[1:]}
pirna = pirna.rename(pirna_rename_dict)
pirna_names = pirna.columns[1:]

pirna = pirna.with_columns(
    [
        pl.col("piRNA").str.split("/").list.get(0).alias("piRNA_name"),
        pl.col("piRNA").str.split("/").list.get(2).alias("piRNA_id"),
    ]
).drop("piRNA")

pirna

V1565,N58,V1874,V777,N80,V1788,N65,V2368,N81,N59,V2286,V406,V100,N82,V2133,V574,V2115,V1921,V714,V637,V1742,V1744,V2248,V1428,V18,V1857,V839,V912,V1048,V911,V940,V681,V708,N60,N70,V148,V655,…,V1297,V1321,V1505,V1249,V1456,V1426,V1394,V1592,V1528,V1591,V833,V1708,V1800,V1776,V1823,V1775,V1834,V2378,V2414,V1860,V1884,V1920,V2322,V2311,V2291,V1957,V2092,V2284,V2278,V2110,V2179,V2147,V2224,V2089,V788,piRNA_name,piRNA_id
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_006779""","""DQ579258"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_007653""","""DQ580430"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_009540""","""DQ582872"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_000302""","""DQ570363"""
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"""hsa_piR_000390""","""DQ570472"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
315,274,230,160,333,154,116,192,213,244,83,147,332,275,94,50,239,168,515,14,122,162,188,111,309,573,18,18,50,59,45,98,70,133,203,138,176,…,225,170,193,210,79,42,418,327,144,70,209,246,319,128,88,160,214,139,183,87,94,127,212,78,136,187,67,154,203,222,139,273,161,175,43,"""hsa_piR_020814""","""DQ598650"""
316,458,493,422,279,266,140,417,243,395,220,74,361,450,77,120,715,170,210,12,289,688,336,203,604,137,104,193,270,276,265,202,410,132,225,291,937,…,157,1068,389,405,177,89,1005,578,329,314,574,551,458,187,117,421,669,646,183,250,170,131,635,248,170,864,126,293,571,231,224,281,574,183,120,"""hsa_piR_020815""","""DQ598651"""
2732,1064,1346,610,1598,894,848,528,1525,1060,810,565,824,1131,380,1242,833,1493,2428,146,901,1991,378,714,3829,766,226,175,544,380,632,547,312,589,472,1735,1421,…,383,193,1198,264,610,112,492,475,1173,1241,330,385,292,375,506,325,687,330,1212,777,665,138,1620,357,621,874,573,661,1069,1170,784,749,1192,921,391,"""hsa_piR_020829""","""DQ598677"""
131,177,441,190,287,281,247,164,429,191,119,190,51,158,36,254,325,244,195,152,139,717,376,227,657,43,201,109,104,249,112,246,45,61,111,181,43,…,149,156,276,444,135,24,132,141,199,153,191,135,152,239,130,275,173,222,322,465,188,137,556,240,166,385,133,400,496,261,98,171,398,271,217,"""hsa_piR_021032""","""DQ598918"""


# common samples

In [14]:
annotations = pl.read_csv("mds_data/preprocessed/annotations.csv")
annotated_samples = annotations["SAMPLE_ID"].to_list()

In [37]:
from src.data_cleaning import get_common_elements

common_names = get_common_elements(
    annotated_samples, mrna_names, te_counts_names, circrna_names, mirna_genes_names
)

common_names.sort()

print(
    f"lengths: {len(mrna_names)=}, {len(mirna_names)=}, {len(circrna_names)=}, {len(te_counts_names)=}"
)
len(common_names)

lengths: len(mrna_names)=86, len(mirna_names)=105, len(circrna_names)=86, len(te_counts_names)=112


74

### notes about overlapping data
    - mirna_names, te_counts_names = 98 (+12, most data available)
    - mrna, circrna = 86 (all of mrna data, baseline number of samples)
    - mrna, circrna, te_counts = 85 (-1)
    - mrna, circrna, te_counts, mirna = 77 (-8)
    - mrna_names, pirna_names, circrna_names, te_counts_names = 76 (-9)
    - mrna_names, mirna, pirna_names, circrna_names, te_counts_names = 76 (-9)
    - annotations only exist for 66 (-19) samples
### existing annotations
    - for mrna there are 75 samples with annotations
    - adding mirna_genes, te_counts, circrnas we get 74 annotated samples
    - adding pirnas or mature mirnas we get only 66



- there is data both for mirna pre-cursor genes (in the allRNA dataframe) and both for the mature mirnas (which should be generally less noisy, even though both are correlated)
- an algorithm that can deal with missing values might be very nice here

## select common samples from each df

In [38]:
mrna = mrna.select("GENE_ID", "GENE_NAME", *common_names)
mirna_genes = mirna_genes.select("GENE_ID", "GENE_NAME", *common_names)
# mirna = mirna.select("miRNA", *common_names)
circrna = circrna.select(
    "STRAND", "circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME", *common_names
)
# pirna = pirna.select("piRNA_name", "piRNA_id", *common_names)
te_counts = te_counts.select("TE", *common_names)

# data preprocessing

In [40]:
preprocess_seq_data(
    df=mrna,
    annotation_cols=["GENE_ID", "GENE_NAME"],
    min_median_expression=100,
    min_detection_rate=0.2,
    top_n=3000,
    output_path="mds_data/preprocessed_74/mrna.csv",
)
preprocess_seq_data(
    df=mirna_genes,
    annotation_cols=["GENE_ID", "GENE_NAME"],
    min_median_expression=5,
    min_detection_rate=0.2,
    top_n=1500,
    output_path="mds_data/preprocessed_74/mirna_genes.csv",
)
# preprocess_seq_data(
#     df=mirna,
#     annotation_cols=["miRNA"],
#     min_median_expression=10,
#     min_detection_rate=0.2,
#     top_n=1500,
#     output_path="mds_data/preprocessed/mirna.csv",
# )
preprocess_seq_data(
    df=circrna,
    annotation_cols=["STRAND", "circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME"],
    min_median_expression=5,
    min_detection_rate=0.2,
    top_n=1500,
    output_path="mds_data/preprocessed_74/circrna.csv",
)
# preprocess_seq_data(
#     df=pirna,
#     annotation_cols=["piRNA_name", "piRNA_id"],
#     min_median_expression=5,
#     min_detection_rate=0.2,
#     top_n=500,
#     output_path="mds_data/preprocessed/pirna.csv",
# )
preprocess_seq_data(
    te_counts,
    annotation_cols=["TE"],
    min_median_expression=10,
    min_detection_rate=0.4,
    top_n=1000,
    output_path="mds_data/preprocessed_74/te_counts.csv",
)

TE,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1441,V1456,V148,V1505,V1528,V1565,V1591,V1592,V1699,V1708,V1742,V1776,V1788,V18,V1800,V1823,V1834,V1857,V1860,V1874,V1884,V1920,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V712,V714,V716,V777,V806,V839,V883,V888,V940
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""HERV-K14CI""",3.937487,2.584002,3.923744,3.382267,4.466436,3.173179,2.708032,3.606354,3.010747,3.037211,2.456546,1.694801,3.71596,2.087903,4.305413,3.768621,4.637976,4.72315,5.612023,4.215554,2.846295,2.784583,2.713055,2.934661,2.561581,2.282879,2.960296,2.346485,1.670563,3.085617,4.489323,4.755818,2.959795,3.792799,4.568957,2.71195,2.41124,2.132184,2.252647,6.43592,7.356513,3.751318,3.78349,3.340501,5.319504,2.891589,1.834447,6.745921,4.347917,8.132623,3.837697,2.780902,2.341658,3.339169,3.62528,4.479788,3.78127,3.688009,4.151041,4.364931,2.277851,4.151719,1.828125,6.602835,3.943482,4.597211,4.06245,5.60684,5.09979,2.834416,3.477567,4.013692,3.594036,3.264589
"""MER67D""",1.321933,1.748084,1.880447,2.522533,1.764922,2.061353,0.813823,2.117061,1.114159,1.675729,1.682811,1.277497,2.665454,0.805822,2.089406,1.840174,1.939539,2.911083,2.644214,2.826344,2.564699,2.096927,2.172877,1.267616,1.57136,0.888161,0.835256,1.688804,0.435562,1.217565,1.584238,2.295038,3.32388,1.736524,3.090667,1.691454,0.779483,1.84179,1.769977,3.514087,4.622106,1.200842,2.784799,1.243885,1.92006,2.534841,0.891768,3.506732,1.705223,4.562986,2.663373,2.308502,0.275018,1.325799,2.192648,3.2197,0.588125,3.260884,1.416639,2.832918,2.187815,0.79801,1.49752,3.541164,1.405594,1.203169,1.322832,1.636382,2.067956,1.594858,1.655077,1.690531,1.993705,1.29255
"""MIR3""",3.531939,3.449021,2.888116,2.075521,1.863308,1.538173,2.234779,2.257968,2.634713,2.198967,1.868938,2.522876,1.65638,4.683059,2.495408,2.792704,2.892951,2.72209,3.118443,1.969461,2.564699,4.462815,2.256756,2.894314,3.229642,2.138573,3.814505,3.81724,3.882161,4.869147,4.623346,2.82083,2.645678,3.880175,2.402799,3.242365,5.342962,0.797773,4.376151,2.460374,3.425925,2.491675,2.861256,1.438528,1.473806,1.658924,1.360826,3.6983,4.072582,3.855231,2.971476,2.801082,1.422273,4.63734,2.043158,2.653509,2.467183,1.202176,2.212714,3.305835,5.069402,2.416306,2.351354,3.823517,3.0456,2.761046,3.407357,3.085576,1.328003,2.715119,3.341267,4.115771,2.991379,4.103192
"""HERVK11DI""",5.267927,4.594053,6.19898,4.481865,4.527754,4.886739,4.084553,4.693327,4.202179,4.441282,4.74391,4.36458,4.907908,3.803064,5.790722,5.363511,5.254239,5.360621,5.454224,4.577979,4.856826,4.205864,4.636123,4.76192,4.230565,4.930052,4.57012,4.36707,4.464508,4.017202,5.263446,5.155403,5.314271,4.311793,5.333379,3.955074,4.065496,4.071673,4.003263,6.429071,7.454765,4.316867,4.322626,4.074067,5.763446,4.839179,3.319359,6.761274,4.174754,7.726511,5.196096,4.596737,5.739405,3.945344,5.197316,4.935029,5.581682,5.398724,5.924068,4.882005,4.801785,4.147707,4.020505,7.005452,4.591475,6.376762,6.526753,6.700169,4.826903,4.609553,5.04626,4.597789,4.522717,4.731235
"""MER41G""",3.531939,2.708684,3.202243,1.786386,2.650034,2.996806,2.445803,2.613157,2.344202,1.763129,2.438575,1.852397,1.527679,1.697962,2.563586,2.133615,2.313293,2.525702,3.861892,2.097862,2.302481,0.87478,1.659705,1.853668,0.798625,3.324615,1.745246,1.795348,1.555035,2.257064,2.50593,3.034601,1.586409,2.017905,2.711394,1.675466,1.882421,1.392051,0.785045,3.779002,4.511186,2.217469,2.047273,1.610006,2.621725,2.273845,2.360271,3.065703,2.649658,4.932851,1.961726,2.128999,1.978244,2.207672,2.114222,2.533055,2.255407,2.368144,2.452706,2.743757,1.993639,1.549921,1.49752,3.74593,1.492738,2.891608,3.088792,3.567897,2.593393,0.715705,2.788863,1.797131,2.235088,1.796147
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""MER21B""",7.499082,6.736585,7.147342,6.509343,7.156006,7.439174,7.228512,7.170062,7.089038,6.925787,7.224156,6.672556,7.061717,7.158073,7.490554,7.368399,7.010463,7.50951,7.808925,6.919863,7.464747,6.485708,7.078053,6.675636,7.042766,6.981132,7.071446,6.786405,6.865882,7.105503,7.43068,6.562897,7.336561,6.590476,7.252775,6.712507,6.277691,6.389749,6.555187,6.926361,7.481186,6.919658,7.197921,6.666594,7.114504,7.128354,6.533549,6.710212,6.977286,7.246057,7.424999,7.103261,6.924663,6.602748,7.544357,7.280473,6.784228,7.679862,6.734633,7.041377,7.129453,6.946481,6.178499,7.040345,6.259131,7.457446,7.180993,7.380195,6.891567,6.997782,7.347767,6.72952,6.794625,6.681528
"""MER21C""",7.176958,6.794437,6.729784,6.44053,6.748292,7.197959,6.945509,7.012833,7.139716,6.813006,6.975679,6.563663,6.959586,6.882216,7.342616,7.327119,6.931757,7.292381,7.891986,6.757338,7.383854,6.148719,6.825125,6.792151,6.694927,6.471233,7.058533,6.849053,6.533874,6.989247,7.410199,6.41379,7.425949,6.434749,7.086539,6.529792,6.37713,6.271618,6.310131,6.675802,7.328168,6.832752,6.917351,6.401277,6.907314,7.008149,6.152296,6.534993,6.88171,7.197396,7.330953,7.118019,6.424439,6.463245,7.31367,7.174427,6.71682,7.541591,6.33423,6.918335,6.901601,6.45135,5.574959,6.736101,6.248709,7.390252,6.884137,7.327111,6.970788,6.877755,7.232642,6.668495,6.558563,6.781873
"""L1M2A_5""",5.92388,5.712003,6.012732,5.133207,5.902036,5.970654,5.561409,5.528111,5.775082,5.440256,5.583994,5.208998,5.564816,5.832065,5.954714,5.953251,5.785472,5.970033,6.814409,5.290576,5.878815,5.632606,5.973387,5.67309,5.794509,5.274939,5.878792,5.684988,5.570421,6.107741,6.190083,5.079174,5.817994,5.480037,5.766069,5.52332,5.403426,5.391362,5.483671,5.602141,6.027552,5.460364,6.117918,5.2466,5.603305,5.760593,4.523616,5.776454,5.37486,5.630194,6.032638,5.779056,5.963373,5.266421,6.362632,5.971018,6.027667,6.231507,5.294805,6.203612,6.04571,5.797734,4.932686,5.741284,5.067973,6.071152,6.198248,6.215773,6.053357,6.058962,6.04159,5.673391,5.238059,5.605399
"""L1MD2""",8.74393,8.267138,8.282537,7.795041,8.2967,8.708908,8.411796,8.279179,8.448783,8.164151,8.442761,7.981906,8.399139,8.388447,8.722497,8.760192,8.212915,8.766916,8.98341,8.135711,8.799596,8.073306,8.196533,8.199826,8.471869,7.819157,8.50747,8.292112,8.110707,8.378757,8.964698,7.480273,8.685185,7.880304,8.563236,7.983077,8.041516,7.752934,7.99829,7.996859,8.451582,8.078587,8.493116,7.956691,8.081244,8.560962,7.489324,8.082711,8.077268,8.442019,8.725141,8.221338,8.382058,7.843388,8.729712,8.787085,8.176138,9.055966,7.867982,8.70023,8.623749,8.266797,6.744592,8.56358,7.654387,8.705009,8.564893,8.785463,8.452839,8.366546,8.633874,8.265729,7.961393,8.35133


In [192]:
annotations = pl.read_csv("mds_data/preprocessed/annotations.csv")
annotations

SAMPLE_NAME,SAMPLE_ID,GROUP,disease,risk,mutations
str,str,str,i64,i64,i64
"""N54_S14""","""N54""","""CTR""",1,0,0
"""N58_S18""","""N58""","""CTR""",1,0,0
"""N60_S15""","""N60""","""CTR""",1,0,0
"""N70_S16""","""N70""","""CTR""",1,0,0
"""N82_S1""","""N82""","""CTR""",1,0,0
…,…,…,…,…,…
"""V839_S17""","""V839""","""EPI""",2,2,0
"""V883_S4""","""V883""","""SPL/EPI""",2,1,0
"""V888_S2""","""V888""","""SPL""",2,1,2
"""V940_S8""","""V940""","""SPL/EPI""",2,1,0


In [193]:
annot_66 = (
    annotations.filter(pl.col("SAMPLE_ID").is_in(common_names))
    .sort("SAMPLE_ID")
    .write_csv("mds_data/preprocessed/annotations_66.csv")
)

In [42]:
annot_74 = (
    annotations.filter(pl.col("SAMPLE_ID").is_in(common_names))
    .sort("SAMPLE_ID")
    .write_csv("mds_data/preprocessed_74/annotations_74.csv")
)

In [194]:
mrna[:, :10].head()

GENE_ID,GENE_NAME,N58,N60,N70,N82,N83,N84,N85,V1048
str,str,i64,i64,i64,i64,i64,i64,i64,i64
"""ENSG00000188026""","""RILPL1""",235,244,266,89,232,198,246,404
"""ENSG00000167578""","""RAB4B""",951,856,905,749,865,1441,682,764
"""ENSG00000078237""","""TIGAR""",184,691,258,393,356,288,482,404
"""ENSG00000158486""","""DNAH3""",0,16,8,30,4,10,0,0
"""ENSG00000283967""","""RP11-432M8.3""",1,0,0,2,2,1,2,0
