In [1]:
%load_ext autoreload
%autoreload 2

import polars as pl
import numpy as np
from src.preprocessing import preprocess_rnaseq_data

# load and transform annotations

In [31]:
# Load the raw sample sheet and derive a tidy per-sample annotation table.
sample_sheet = pl.read_excel("mds_data/raw/sample sheet for CVUT.xlsx")

annotations = (
    sample_sheet.with_columns(
        # SAMPLE_ID is the part of SAMPLE_NAME before the first underscore.
        # A native list expression is used instead of a per-row Python lambda
        # (map_elements), which keeps the work inside the polars engine.
        pl.col("SAMPLE_NAME").str.split("_").list.first().alias("SAMPLE_ID")
    )
    .sort("SAMPLE_ID")
    # Strip the numeric prefixes / qualifiers from the spreadsheet headers.
    .rename(
        {
            "1 disease": "disease",
            "2 risk": "risk",
            "3 mutations (SF3B1only_wt)": "mutations",
        }
    )
    # select() keeps only the listed columns, so RUN, PLATFORM,
    # N_UNIQ_MAP_READS and UNIQ_MAP are discarded here without a separate drop().
    .select("SAMPLE_NAME", "SAMPLE_ID", "GROUP", "disease", "risk", "mutations")
)

# annotations.write_csv("mds_data/preprocessed/annotations.csv")

# load mRNA

In [12]:
# Read the annotated RNA-seq table (tab-separated) and keep only the rows
# whose GENE_TYPE is "protein_coding".
mrna = pl.read_csv(
    "mds_data/raw/200625_allRNA_fromRNAseq_annot_hg38.tsv",
    separator="\t",
)
is_protein_coding = pl.col("GENE_TYPE") == "protein_coding"
rnaseq = mrna.filter(is_protein_coding)

# GENE_ID column of the retained rows, as a Series.
gene_ids = rnaseq.get_column("GENE_ID")
# Same rows without the coordinate and gene-type columns.
rnaseq_samples = rnaseq.drop(["CHR", "START", "END", "GENE_TYPE"])

In [19]:
# Everything after the first two columns is treated as a sample column.
sample_names = rnaseq_samples.columns[2:]

# Map each sample column to the part of its name before the first underscore.
rename_dict = dict(
    zip(sample_names, (name.partition("_")[0] for name in sample_names))
)
rnaseq_samples = rnaseq_samples.rename(rename_dict)

GENE_ID,GENE_NAME,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V1249,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1426,V1441,V1456,V148,V1505,V1528,V1554,V1565,V1577,V1591,V1592,…,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V681,V708,V712,V714,V716,V777,V788,V795,V806,V833,V839,V853,V883,V888,V940,V956,V957
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""ENSG00000188026""","""RILPL1""",407,235,244,266,89,232,198,246,134,279,131,133,114,404,60,207,196,172,146,256,393,195,110,106,463,128,239,203,223,153,65,384,146,226,210,…,200,120,362,315,93,170,376,120,715,180,314,263,128,426,352,164,538,44,131,432,262,98,65,92,355,167,283,122,458,439,139,260,273,474,364,176,69
"""ENSG00000167578""","""RAB4B""",951,951,856,905,749,865,1441,682,851,681,849,555,454,764,1080,812,1214,710,1111,613,1526,1348,1390,765,1665,527,1259,1026,1317,1183,412,1682,1278,1177,753,…,914,1798,937,812,304,636,1165,1137,1514,1024,920,989,948,2094,1350,1989,1261,460,685,1374,778,831,786,1370,1304,882,3043,1605,801,1118,907,1815,887,1130,813,0,1242
"""ENSG00000078237""","""TIGAR""",580,184,691,258,393,356,288,482,275,277,325,240,329,404,301,240,777,296,404,173,343,504,878,133,374,686,606,485,322,309,151,508,731,488,568,…,332,511,424,218,142,304,511,590,475,564,366,400,219,998,376,817,609,80,342,408,471,336,217,630,1098,243,746,361,499,486,1073,317,328,362,227,242,522
"""ENSG00000158486""","""DNAH3""",18,0,16,8,30,4,10,0,18,9,3,3,10,0,3,0,27,29,0,12,97,0,4,2,31,6,0,195,8,0,8,11,7,3,0,…,0,12,92,17,266,0,7,8,4,3,3,0,2,20,43,0,12,0,134,6,0,4,9,80,23,10,227,0,3,47,109,0,0,19,2,0,17
"""ENSG00000283967""","""RP11-432M8.3""",0,1,0,0,2,2,1,2,1,1,0,0,1,0,1,0,2,3,0,3,2,0,0,0,0,1,0,0,1,2,3,0,2,1,0,…,0,0,7,2,8,0,1,0,6,1,0,0,0,2,0,0,1,0,7,1,0,0,1,0,0,0,1,0,0,2,1,0,4,2,1,1,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000186115""","""CYP4F2""",152,182,25,144,178,120,105,76,257,203,114,26,57,0,129,487,449,81,381,43,139,32,0,324,60,621,138,255,115,30,5,281,910,50,1116,…,116,22,14,474,58,59,119,248,38,8,11,127,177,8,96,458,157,191,58,213,308,752,481,237,138,7,468,545,11,143,226,70,152,84,171,0,278
"""ENSG00000009694""","""TENM1""",22,32,11,42,20,32,26,25,75,33,19,30,3,11,173,5,54,31,25,46,93,2876,28,26,40,48,7,90,23,17,52,49,55,29,7,…,23,31,78,126,108,138,15,232,231,107,38,7,18,203,39,4,16,6,777,11,59,46,18,54,58,35,97,36,19,44,57,30,23,36,32,0,412
"""ENSG00000123685""","""BATF3""",127,67,134,147,23,216,82,90,51,58,64,17,72,83,86,178,147,124,76,70,50,115,34,100,185,166,128,131,150,73,49,227,239,103,48,…,33,104,109,92,98,315,119,37,223,60,52,74,67,184,182,140,351,25,76,210,185,90,181,168,343,262,165,315,472,215,106,366,137,131,84,92,53
"""ENSG00000105063""","""PPP6R1""",22845,28193,26489,21006,17065,22682,26769,18729,20738,22708,18446,18690,18843,24906,15464,18758,21505,28065,21519,26893,24247,22835,31129,19041,93501,24492,25224,19994,24718,21241,8544,43097,30355,28033,17662,…,17448,34875,15198,22322,4353,14988,20336,32244,31834,21384,15798,18774,17235,29815,30757,27047,23945,7123,14442,23880,29027,27304,13041,20407,23342,15997,46899,40751,27759,24271,16824,33950,23426,30607,25504,0,25170


- Accounted for differences in sequencing depth between samples, so gene counts are now comparable across samples.
- Computed a scale-invariant variance for each feature.

In [15]:
# Reload the preprocessed mRNA table from disk and display it.
# NOTE(review): no visible cell writes this file — confirm how it is produced.
preprocessed_mrna_path = "mds_data/preprocessed/mrna.csv"
mrna = pl.read_csv(preprocessed_mrna_path)
mrna

GENE_ID,GENE_NAME,N54,N58,N60,N70,N82,N83,N84,N85,N86,N87,NV1428,NV911,NV912,V1048,V108,V1090,V1249,V125,V1279,V1297,V1321,V1337,V1394,V1422,V1426,V1441,V1456,V148,V1505,V1528,V1554,V1565,V1577,V1591,V1592,…,V1921,V2089,V2092,V2110,V2133,V221,V2241,V344,V359,V406,V456,V513,V538,V553,V574,V624,V630,V637,V655,V67,V681,V708,V712,V714,V716,V777,V788,V795,V806,V833,V839,V853,V883,V888,V940,V956,V957
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ENSG00000119862""","""LGALSL""",4.609742,6.305122,7.34166,5.078978,2.327503,5.038414,4.830448,3.998894,6.377012,6.009781,2.580375,4.950584,4.330228,5.575172,4.88938,6.762901,6.795019,5.40077,4.383507,6.844055,6.498622,3.872351,5.664515,7.466903,5.681666,3.938071,5.605586,3.997065,8.245561,7.213235,5.015115,5.197198,4.312844,8.174119,5.537239,…,5.944051,2.691972,5.978596,5.227769,5.196696,4.806615,7.410495,4.408127,7.558337,6.46822,4.809567,8.575381,5.523641,3.62337,11.841561,4.792746,5.622478,4.813476,6.413924,6.560457,5.493026,5.168636,4.778831,4.347013,4.539656,4.68481,4.507909,6.209175,5.710011,4.971005,5.718705,4.468713,6.368104,5.004319,6.103638,4.013492,1.550629
"""ENSG00000115461""","""IGFBP5""",0.383785,0.884401,3.852909,0.88264,2.574637,3.141027,4.55191,3.435569,7.631181,3.968279,1.259825,2.112607,3.167677,6.566436,1.877367,5.089771,1.486982,3.176264,3.511324,4.065442,4.701745,2.37554,4.916427,5.486655,1.68375,2.128038,3.682054,7.570638,2.46354,2.434868,4.242377,7.007405,2.672262,5.752307,6.451888,…,3.174334,3.480782,4.90968,5.736111,6.013312,3.668968,4.37315,2.686858,5.199256,1.793209,1.379537,2.448284,3.619038,2.744246,3.423066,3.606244,4.171163,1.353894,2.619673,10.724792,4.993573,2.168894,1.665129,1.174696,0.817695,3.324859,2.253932,4.925828,5.785254,1.986328,1.818282,3.137375,3.344353,4.169854,5.635978,4.013492,5.064197
"""ENSG00000173083""","""HPSE""",5.62365,5.789863,6.852178,5.970369,4.012532,5.061277,4.842021,4.230021,5.692444,5.303934,3.842874,4.496573,5.321763,5.432615,6.554397,6.897859,5.812747,5.720641,5.107516,6.361615,5.745989,4.9728,5.093704,6.662515,5.612576,5.375189,5.064361,5.172437,7.324857,6.518994,4.827037,5.570144,5.045736,7.347978,5.397726,…,6.131655,4.631986,5.985651,6.093251,6.067702,5.638232,6.197849,3.588647,7.199151,5.569124,5.256233,6.70368,6.119288,5.981056,11.138551,4.842565,7.062777,4.858389,5.753053,6.474508,4.561179,4.534978,4.62575,4.452047,4.93086,6.000104,4.11464,4.209457,6.024754,4.741412,6.326534,5.279811,5.861999,5.799159,5.952677,5.099498,4.654609
"""ENSG00000137801""","""THBS1""",8.445765,9.62882,11.310243,8.141002,6.105598,8.374229,8.632538,7.122861,10.94922,9.185084,4.454176,8.791534,8.699318,9.567764,8.71589,9.73339,9.896606,8.825487,7.496517,10.059819,9.130098,6.600726,8.688134,11.527945,8.371418,8.984096,10.193962,7.111358,11.197505,10.294274,7.822015,7.91981,9.238904,11.457146,7.425317,…,9.976953,3.702249,9.530873,9.798192,8.468951,7.944044,10.728624,6.882848,11.356415,8.52076,8.117116,10.432644,9.264041,6.058134,14.660534,7.416455,9.615201,7.524854,8.626543,10.343842,7.870944,9.527053,5.380756,5.313661,7.804956,6.403641,6.940197,10.247806,9.236112,7.179462,9.141851,6.889029,9.714324,9.391288,9.912739,0.0,1.909334
"""ENSG00000101162""","""TUBB1""",5.024264,7.281474,9.105877,5.816877,3.699451,5.908215,6.35398,5.180819,8.091847,7.620279,3.373536,5.736305,5.912057,6.520498,6.120521,7.790017,7.270352,7.471383,4.811169,7.451022,7.850294,4.899481,6.450811,8.705624,6.297681,5.304556,5.701465,4.412993,9.21867,8.899784,6.052156,5.061337,5.375059,9.482943,4.317056,…,7.193222,4.992064,6.948275,7.409178,5.812949,4.555432,10.374933,3.551601,8.691506,6.428368,5.649217,10.906883,6.985803,5.190295,12.500251,4.859498,7.149664,3.488672,7.652105,7.782204,5.198999,5.437756,3.715916,4.731629,4.753585,2.62394,4.843387,7.603708,7.288539,3.128868,7.20109,2.559156,6.440348,6.62782,6.989698,0.0,2.174519
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ENSG00000121406""","""ZNF549""",7.159546,6.967168,6.639957,7.219407,6.592858,6.927599,6.543102,6.219506,6.158718,6.085332,6.795012,6.457562,6.591067,6.90825,7.304494,6.61999,7.195222,6.880289,7.196136,8.417588,7.031883,7.069739,7.016209,6.544977,8.27008,6.874759,6.555127,7.004077,6.996441,6.987034,7.233486,7.056999,7.189168,6.977688,6.755756,…,6.999319,6.458061,6.476127,6.657079,7.029764,6.980602,6.950234,6.874747,6.442704,6.942585,7.457708,7.314162,7.173207,6.974017,6.479216,6.985913,6.823911,6.50115,7.248639,6.926332,6.669926,6.790663,7.180936,7.051842,7.288007,7.756547,7.236971,6.942456,6.382392,7.324955,7.167454,7.301487,6.936964,6.586231,6.417766,0.0,7.14822
"""ENSG00000173480""","""ZNF417""",7.520442,7.374774,7.056085,7.450963,7.373246,7.386739,6.911519,7.200952,6.946109,6.760108,6.929212,6.759575,7.104456,7.11795,7.841271,7.555765,8.077603,7.575516,7.58034,9.147917,7.384709,7.416049,7.186646,7.131651,8.833284,7.157249,7.02171,7.658366,7.011337,7.065439,8.163415,7.291184,7.381108,7.062617,7.568578,…,7.369259,6.896857,6.650506,6.891521,7.501699,7.658794,7.352647,7.135835,7.011845,7.919243,7.985993,7.595348,8.009563,7.790945,6.848137,7.181229,7.291761,7.4896,7.638197,7.252011,7.377859,7.276841,7.805223,7.333828,7.596943,7.897366,7.217369,7.438577,7.108766,7.530069,7.608932,7.184265,7.507625,7.031437,7.013452,0.0,7.490398
"""ENSG00000073536""","""NLE1""",7.233477,7.644225,7.364629,7.60388,6.976537,7.156637,7.148057,7.057194,6.929481,7.082549,7.423584,7.348293,7.64609,7.486036,7.532657,7.551296,7.232083,7.933804,7.596337,7.241443,7.567622,7.099005,7.092394,7.696935,7.245986,7.427018,7.618814,6.978145,7.199421,7.171413,6.704976,8.626248,7.755269,7.163312,7.600747,…,7.366979,7.313036,7.461057,7.657945,7.906064,7.52206,7.343676,7.447016,7.721676,7.377811,7.435891,7.40309,6.98498,7.062313,5.965873,7.072876,7.014446,6.924572,7.555338,7.969748,7.501582,7.631108,7.558663,7.872811,7.588393,7.668467,8.056605,7.723613,7.63864,7.589309,7.154719,7.939297,7.784572,7.634957,7.37435,0.0,7.259458
"""ENSG00000100852""","""ARHGAP5""",9.615983,8.944406,9.037929,9.532646,9.862554,9.816559,7.921965,9.805586,9.660742,9.28513,9.71981,9.346066,7.790377,9.409595,6.637992,8.046513,9.370198,9.074336,8.536449,9.749852,9.181239,9.105582,9.830294,9.155843,9.025563,9.857038,8.871359,9.277742,8.398815,9.383191,10.020691,8.934665,9.659212,8.836566,9.921786,…,9.619468,9.131779,9.00562,9.082912,8.555952,9.532174,9.282022,8.335274,8.95493,8.209931,9.722139,8.41057,7.789924,7.924109,7.569714,9.108189,9.442059,9.173961,8.365445,8.982243,8.975724,9.223921,9.997265,9.668165,10.048545,8.744432,7.529946,9.635188,9.103926,9.683704,9.60535,8.571407,9.154116,8.815191,8.973538,9.524952,9.738131


# load miRNA

In [32]:
# Read the raw miRNA table from its Excel export.
mirna_raw_path = "mds_data/raw/final_all_samples_miRNA_seq.xlsx"
mirna = pl.read_excel(mirna_raw_path)

In [33]:
# Shorten every column after the first one to the part of its name before the
# first underscore; the first column is left as-is.
# NOTE(review): the displayed headers show V1428/V911/V912 here but
# NV1428/NV911/NV912 in the mRNA table — confirm whether these refer to the
# same samples before intersecting sample IDs across datasets.
mirna_sample_cols = mirna.columns[1:]
mirna_rename_dict = {name: name.split("_", 1)[0] for name in mirna_sample_cols}
mirna = mirna.rename(mirna_rename_dict)
mirna

miRNA,V1565,N58,V1874,V777,N80,V1788,N65,V2368,N81,N59,V2286,V406,V100,N82,V2133,V574,V2115,V1921,V714,V637,V1742,V1744,V2248,V1428,V18,V1857,V839,V912,V1048,V911,V940,V681,V708,N60,N70,V148,…,V1441,V1699,V1297,V1321,V1505,V1249,V1456,V1426,V1394,V1592,V1528,V1591,V833,V1708,V1800,V1776,V1823,V1775,V1834,V2378,V2414,V1860,V1884,V1920,V2322,V2311,V2291,V1957,V2092,V2284,V2278,V2110,V2179,V2147,V2224,V2089,V788
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""hsa-let-7a-2-3p""",23,0,11,12,11,0,13,12,12,36,15,0,35,0,17,10,12,10,13,0,0,18,58,11,12,60,0,17,26,13,18,0,13,0,0,0,…,0,15,10,0,27,0,0,0,0,10,0,18,0,0,10,19,79,12,0,17,24,17,0,12,33,0,13,0,10,12,17,47,0,13,26,20,12
"""hsa-let-7a-3p""",1224,1619,3376,1972,1811,1490,589,1721,2311,1272,1211,1291,280,1455,197,1503,1785,1379,1150,877,1239,5322,1659,1081,4406,431,1859,2536,2123,2414,1062,1164,1876,642,663,1079,…,1970,1087,1096,1382,1564,2106,1657,542,1472,1084,1287,1164,1887,1440,1033,1326,879,2074,1043,3247,1356,1351,1133,242,3349,1745,1555,1015,1132,1667,3147,1072,797,1027,4693,1156,2476
"""hsa-let-7a-5p""",608369,933004,1069405,850554,594247,1153938,272068,927860,608709,612056,445097,781218,122906,565251,118793,305157,1480722,512584,513964,414213,647469,1169076,569087,430376,821909,424407,145849,198109,250684,127034,181953,412468,315491,546646,555368,669588,…,803740,600765,772108,873003,522350,980458,359406,173762,1578499,967461,538431,420551,641671,1141110,1320356,598512,423203,827435,475608,1073692,604248,491170,735693,321751,654808,587831,722210,522961,451919,622551,737114,464613,481072,606788,716735,575703,175859
"""hsa-let-7b-3p""",1001,932,1023,1177,614,1251,285,1086,615,825,617,880,185,796,280,428,1437,571,588,477,682,2064,710,653,1556,460,639,1164,811,964,528,947,695,578,799,1046,…,1406,540,1149,1080,955,1515,567,483,1356,867,894,504,1018,1922,960,678,434,1138,726,1606,857,800,934,402,1031,972,980,648,592,1132,1185,842,766,959,1045,1071,1190
"""hsa-let-7b-5p""",342092,304774,294765,290467,176982,430718,85112,340512,176784,226291,145897,319534,43416,204520,37736,85009,435604,149807,181362,153956,268507,409761,169404,160704,272913,189890,67291,91100,104068,41298,72557,128479,137384,163905,191757,281571,…,324863,195570,401014,305985,171342,412420,125376,125395,653889,414149,191121,140175,196653,639686,607506,170347,151499,285312,179951,392510,187292,148536,284648,23841,199909,210815,244025,61806,150372,254426,242105,175841,174812,279244,212584,214623,47734
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""hsa-miR-98-5p""",3623,6190,11127,5755,7987,7638,3615,4912,8379,6262,5413,5450,1803,3653,1182,8473,10794,6428,3961,4019,4443,18940,5849,3854,16665,2817,1123,979,1967,1027,1221,2805,1337,3109,3969,3602,…,3447,5269,3617,3639,8158,8577,3443,812,8640,3890,6773,6405,4029,3371,6547,4734,5314,6176,3302,6752,7932,8478,4598,611,7403,5955,10223,1818,5579,7627,6885,5565,5088,4885,7689,4639,1691
"""hsa-miR-99a-3p""",145,578,2236,1246,721,1013,403,1020,834,636,206,571,88,891,124,309,1206,545,472,163,1245,1755,318,390,1226,325,286,445,514,429,572,295,541,305,961,440,…,781,337,754,1960,399,591,522,86,1033,302,200,439,1310,546,683,377,132,1758,1193,846,396,439,255,455,860,322,232,1239,392,643,765,260,205,689,1085,243,428
"""hsa-miR-99a-5p""",2325,5912,46227,14205,10895,7921,13504,8229,18405,20334,1753,5294,690,4773,3681,6721,25169,12275,2998,9710,5980,14416,4179,2720,13514,3690,2604,1975,2367,3836,2411,2446,2441,2907,9476,4754,…,7129,3175,12337,15035,2854,9555,2547,1000,9952,2709,1768,2669,19219,5157,5804,3801,1972,13403,13523,8872,7846,6208,1859,3834,5936,3779,2658,12357,3347,9017,8966,3626,2045,5751,10492,1472,2934
"""hsa-miR-99b-3p""",346,489,462,1045,264,385,156,388,267,314,118,377,80,330,79,298,1007,144,305,65,152,568,214,154,261,204,168,193,200,146,276,215,310,199,332,412,…,377,143,554,580,272,342,126,258,368,453,276,279,1007,266,548,179,96,618,287,778,356,155,83,600,358,312,696,1094,168,442,277,171,129,288,348,163,169


# load circRNA

In [43]:
# Load the raw circRNA table here so this preview runs on a fresh kernel
# instead of relying on a `circrna` variable that is only assigned by a later
# cell. "NA" entries are parsed as nulls.
circrna_raw = pl.read_csv(
    "mds_data/raw/200625_circRNA_fromRNAseq_annot_hg19.tsv",
    separator="\t",
    null_values=["NA"],
)
circrna_raw

CHR,START,END,TYPE,STRAND,circRNA_ID,GENE_ID,GENE_TYPE,GENE_NAME,N54_S14,N58_S18,N60_S15,N70_S16,N82_S1,N83_S3,N84_S4,N85_S19,N86_S13,N87_S20,NV1428_S3,NV911_S19,NV912_S20,V1048_S12,V108_S5,V1090_S8,V1249_S13,V125_S5,V1279_S21,V1297_S10,V1321_S4,V1337_S3,V1394_S4,V1422_S6,V1426_S9,V1441_S2,V1456_S13,V148_S10,…,V1921_S15,V2089_S9,V2092_S3,V2110_S16,V2133_S17,V221_S19,V2241_S15,V344_S16,V359_S3,V406_S18,V456_S4,V513_S16,V538_S12,V553_S11,V574_S9,V624_S15,V630_S11,V637_S21,V655_S18,V67_S6,V681_S10,V708_S17,V712_S22,V714_S19,V716_S1,V777_S20,V788_S11,V795_S8,V806_S1,V833_S12,V839_S17,V853_S7,V883_S4,V888_S2,V940_S8,V956_S6,V957_S2
str,i64,i64,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""chr1""",567534,567677,"""exon""","""-""",,"""ENSG00000237973""","""unprocessed_pseudogene""","""MTCO1P12""",2,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""chr1""",741178,745550,"""exon""","""-""","""hsa_circ_0002333""","""ENSG00000230092""","""transcribed_unprocessed_pseudo…","""AL669831.4""",9,,6,18,4,5,13,17,27,6,9,,5,4,5,5,5,3,6,14,37,8,51,6,7,14,7,15,…,6,23,,,,,5,14,8,,12,,16,,,12,,,2,13,10,7,12,11,16,11,15,13,48,11,6,27,5,11,,10,4
"""chr1""",891302,892653,"""exon""","""-""","""hsa_circ_0009205""","""ENSG00000188976""","""protein_coding""","""NOC2L""",2,,,,,,,,2,,,,,,,,,,,,,,,,2,,,,…,,,,,,,,,3,,,,,,,,,,,,,,,,,,3,,,,,4,,,,,
"""chr1""",1158623,1159348,"""exon""","""-""","""hsa_circ_0000002""","""ENSG00000078808""","""protein_coding""","""SDF4""",17,23,25,23,22,23,32,27,35,26,23,13,8,13,10,18,23,31,16,15,49,31,104,16,34,23,27,32,…,11,68,12,8,,,8,38,34,19,19,17,17,36,11,53,23,5,33,41,34,28,10,25,21,33,49,53,29,11,5,123,33,33,18,29,15
"""chr1""",1270866,1271085,"""exon""","""-""",,"""ENSG00000107404""","""protein_coding""","""DVL1""",2,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr9""",123301318,123330666,"""exon""","""-""",,"""ENSG00000136861""","""protein_coding""","""CDK5RAP2""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2
"""chr9""",133485340,133510125,"""exon""","""+""",,"""ENSG00000107164""","""protein_coding""","""FUBP3""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2
"""chr9""",135985012,135987539,"""exon""","""-""",,"""ENSG00000160271""","""protein_coding""","""RALGDS""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2
"""chr9""",140360747,140360953,"""intron""","""-""",,"""ENSG00000130653""","""protein_coding""","""PNPLA7""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2


In [70]:
# Annotation columns; every other column is eligible for null filling.
circrna_annotation_cols = [
    "CHR", "START", "END", "TYPE", "STRAND",
    "circRNA_ID", "GENE_ID", "GENE_TYPE", "GENE_NAME",
]
kept_gene_types = ["protein_coding", "lincRNA", "antisense"]

# Read the tab-separated circRNA table; "NA" entries are parsed as nulls.
circrna = pl.read_csv(
    "mds_data/raw/200625_circRNA_fromRNAseq_annot_hg19.tsv",
    separator="\t",
    null_values=["NA"],
)
# Replace nulls with 0 in every non-annotation column.
circrna = circrna.with_columns(
    pl.exclude(circrna_annotation_cols).fill_null(strategy="zero")
)
# Basic filtering: keep rows with TYPE == "exon" whose GENE_TYPE is one of the
# selected types, then drop the coordinate and TYPE columns.
is_kept_gene_type = pl.col("GENE_TYPE").is_in(kept_gene_types)
is_exonic = pl.col("TYPE") == "exon"
circrna = circrna.filter(is_kept_gene_type & is_exonic).drop(
    "CHR", "START", "END", "TYPE"
)
# Preview the first nine columns.
circrna[:, :9]

STRAND,circRNA_ID,GENE_ID,GENE_TYPE,GENE_NAME,N54_S14,N58_S18,N60_S15,N70_S16
str,str,str,str,str,i64,i64,i64,i64
"""-""","""hsa_circ_0009205""","""ENSG00000188976""","""protein_coding""","""NOC2L""",2,0,0,0
"""-""","""hsa_circ_0000002""","""ENSG00000078808""","""protein_coding""","""SDF4""",17,23,25,23
"""-""",,"""ENSG00000107404""","""protein_coding""","""DVL1""",2,0,0,0
"""-""",,"""ENSG00000160075""","""protein_coding""","""SSU72""",3,0,0,0
"""-""","""hsa_circ_0000007""","""ENSG00000078369""","""protein_coding""","""GNB1""",6,6,0,2
…,…,…,…,…,…,…,…,…
"""+""",,"""ENSG00000182150""","""protein_coding""","""ERCC6L2""",0,0,0,0
"""-""",,"""ENSG00000136861""","""protein_coding""","""CDK5RAP2""",0,0,0,0
"""+""",,"""ENSG00000107164""","""protein_coding""","""FUBP3""",0,0,0,0
"""-""",,"""ENSG00000160271""","""protein_coding""","""RALGDS""",0,0,0,0


# load TE counts

# load piRNA

In [None]:
# get common samples for all dataframes