# **Processing Files of Interest from TCGA-BRCA**
TCGA: The Cancer Genome Atlas

# Importing Libraries

In [1]:
import os

import pandas as pd

# Paths

In [2]:
# Data folder path
DATA_FOLDER = '../../data'

# External data folder path
EXTERNAL_DATA_PATH = f'{DATA_FOLDER}/external/tcga-brca'

# Basal-like files folder path
BASAL_LIKE_PATH = f'{EXTERNAL_DATA_PATH}/basal-like-files'

# HER2-enriched files folder path
HER2_ENRICHED_PATH = f'{EXTERNAL_DATA_PATH}/her2-enriched-files'

# Luminal A files folder path
LUMINAL_A_PATH = f'{EXTERNAL_DATA_PATH}/luminal-a-files'

# Luminal B files folder path
LUMINAL_B_PATH = f'{EXTERNAL_DATA_PATH}/luminal-b-files'

# Normal tissue files folder path
NORMAL_TISSUE_PATH = f'{EXTERNAL_DATA_PATH}/normal-tissue-files'

# Processed data folder path
PROCESSED_DATA_PATH = f'{DATA_FOLDER}/processed'

# Functions

In [3]:
def mir_files_processing(path):
    # List the miRNA-Seq files contained in the folder
    files = [f for f in os.listdir(path) if f.startswith('mirna-seq_')]

    # Retrieve the file id of the first file
    file_id = files[0].replace('mirna-seq_', '')
    file_id = file_id.replace('.txt', '')

    # Initialize the DataFrame with the read count of the first file
    df_mir_reads = pd.read_csv(f'{path}/{files[0]}', sep='\t')
    df_mir_reads = df_mir_reads \
        [['miRNA_ID', 'read_count']] \
        .rename(columns={'read_count': file_id})

    # Transform each file read count into a new column of the DataFrame
    for file in files[1:]:
        # Retrieve the file id
        file_id = file.replace('mirna-seq_', '')
        file_id = file_id.replace('.txt', '')

        # Read the file and rename the read count column
        df_temp = pd.read_csv(f'{path}/{file}', sep='\t')
        df_temp = df_temp \
            [['miRNA_ID', 'read_count']] \
            .rename(columns={'read_count': file_id})

        # Add the file read count as a new DataFrame column
        df_mir_reads = df_mir_reads \
            .merge(
                right=df_temp,
                left_on='miRNA_ID',
                right_on='miRNA_ID',
                how='outer'
            )
    
    # Rename the microRNA id column
    df_mir_reads = df_mir_reads.rename(columns={'miRNA_ID': 'mirna'})
    
    return df_mir_reads

In [4]:
def rna_files_processing(path):
    # List the RNA-Seq files contained in the folder
    files = [f for f in os.listdir(path) if f.startswith('rna-seq_')]

    # Retrieve the file id of the first file
    file_id = files[0].replace('rna-seq_', '')
    file_id = file_id.replace('.tsv', '')

    # Initialize the DataFrame with the read count of the first file
    df_gene_reads = pd.read_csv(f'{path}/{files[0]}', sep='\t', skiprows=1)
    df_gene_reads = df_gene_reads \
        .query('gene_type == "protein_coding"') \
        [['gene_name', 'unstranded']] \
        .drop_duplicates(subset=['gene_name'], keep='first') \
        .rename(columns={'unstranded': file_id}) \
        .reset_index(drop=True)
        
    # Transform each file read count into a new column of the DataFrame
    for file in files[1:]:
        # Retrieve the file id
        file_id = file.replace('rna-seq_', '')
        file_id = file_id.replace('.tsv', '')

        # Read the file and rename the read count column
        df_temp = pd.read_csv(f'{path}/{file}', sep='\t', skiprows=1)
        df_temp = df_temp \
            .query('gene_type == "protein_coding"') \
            [['gene_name', 'unstranded']] \
            .drop_duplicates(subset=['gene_name'], keep='first') \
            .rename(columns={'unstranded': file_id})

        # Add the file read count as a new DataFrame column
        df_gene_reads = df_gene_reads \
            .merge(
                right=df_temp,
                left_on='gene_name',
                right_on='gene_name',
                how='inner'
            )
        
    # Rename the gene name column
    df_gene_reads = df_gene_reads.rename(columns={'gene_name': 'gene'})
    
    return df_gene_reads

In [5]:
def files_processing(files_path, files_prefix):
    # Process the miRNA-Seq files related to the cohort of interest
    df_mir_reads = mir_files_processing(files_path)

    # Store the DataFrames of the processed miRNA-Seq files into CSV files
    file_name = f'{files_prefix}-mirna-reads.csv'
    df_mir_reads.to_csv(f'{PROCESSED_DATA_PATH}/{file_name}', index=False)
    
    # Process the RNA-Seq files related to the cohort of interest
    df_rna_reads = rna_files_processing(files_path)

    # Store the DataFrames of the processed RNA-Seq files into CSV files
    file_name = f'{files_prefix}-rna-reads.csv'
    df_rna_reads.to_csv(f'{PROCESSED_DATA_PATH}/{file_name}', index=False)
    
    return df_mir_reads, df_rna_reads

# Tumor Tissue Analysis Files

## Basal-like

In [6]:
# Process Basal-like related miRNA-Seq and RNA-Seq files
df_mir_basal_reads, df_rna_basal_reads = files_processing(
    files_path=BASAL_LIKE_PATH, files_prefix='basal-like'
)

In [7]:
# Print the DataFrame of processed Basal-like microRNA reads
df_mir_basal_reads

Unnamed: 0,mirna,1d47e720-1a02-45f4-b0dc-99861916e3e1,e1271843-3bd3-4f0b-8e6f-fc763ad65776,8b18c127-13dc-4ca5-b3e8-38ccbd21b010,9adf1983-85b9-4ef6-aff4-dea772ff91a2,3cdba1e4-b7d4-47ea-8cae-68a8262515d4,6b1c475e-b897-457e-a6b7-3aad397202b4,b5e2d74c-1626-42de-b97c-4e988e44f9f5,5221cacc-3d04-4a17-8822-6fd9b4bfb2a4,d3ed4500-368c-42a3-998b-7aca451db233,...,e68bb5da-1d15-4de9-b8d4-eac78a205699,dda41694-ea43-4044-a1a6-7c445734c36c,9e73d394-b8e3-4ee2-bb35-a1fb43fd280f,73a1ad6d-17fa-4f62-9275-8c6a5c0c1f2d,de750a3e-5286-44ec-a701-bcaa7a06e569,0b9c4345-bf3a-4de7-a022-4039f93d7b27,3e4b3c1f-cbb8-487f-9c9e-01291970ade3,aa4061d1-8b1e-4aed-b0d7-fe3a5466737e,b5604d13-4cde-44fa-be67-203bc5da0caf,788a05e5-5fee-4430-b303-d4bc30649158
0,hsa-let-7a-1,12807,6625,14050,9217,9647,37461,10941,6618,8235,...,24466,7942,9577,10976,188207,14831,27985,29594,20308,6605
1,hsa-let-7a-2,12718,6502,13976,9389,9306,37071,10795,6642,8117,...,24295,7672,9612,11292,187398,14650,27512,29517,20127,6589
2,hsa-let-7a-3,13064,6864,14274,9587,9377,37213,10973,6640,8176,...,24856,7765,9694,11200,190402,14915,27972,29681,20577,6741
3,hsa-let-7b,26120,33714,26191,70636,31930,106137,20726,12539,17189,...,102860,39327,11352,40680,527829,50167,62915,61024,50021,23325
4,hsa-let-7c,4490,708,1529,2697,6327,1170,7983,4253,3670,...,9166,3764,1373,4167,13574,1290,12453,7260,9049,832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,178,69,22,7,43,73,111,29,93,...,115,10,46,36,227,33,129,50,96,117
1878,hsa-mir-98,234,61,96,110,92,139,126,117,88,...,301,137,59,21,1192,192,209,214,304,195
1879,hsa-mir-99a,1278,212,474,428,2808,267,829,871,730,...,3021,1284,158,643,4145,288,3581,3445,2070,373


In [8]:
# Print the DataFrame of processed Basal-like gene reads
df_rna_basal_reads

Unnamed: 0,gene,c7aeba7e-b78e-4586-a4dd-8a04cc440737,5fbe4a11-de4d-418e-9bc3-75b768d4a665,2cd221ee-4d29-4f40-ab4c-780cc9045c5a,77f150b1-5f40-442d-91c5-6d3571513513,59858555-bc6a-4286-8280-0f8341123cac,08b0fe4c-ee3c-4510-b75a-6d240cb038cc,4f464ced-080f-41f1-9a00-955db0d5fda6,1f5bbb4e-9b0e-4953-a360-83fd7b6a2267,f2a4b38d-bfc0-4363-9a58-b5fa2af26ff3,...,27f4c9d4-d6ba-4fea-9f0b-fb06066e69a8,c95a37a1-d3cd-421c-bc5f-0f8dc448f64c,c6fd0f95-d74a-49e3-9a70-fec12e63ff1a,511ca25f-0b7f-4912-9b79-ed551721a420,6e112ec6-1791-4764-8a62-7ad4dfea2d3b,8a7fe670-bcef-4fcb-9759-9892ff5f4f61,7789d241-991c-4cc9-a557-69d5c439b930,1d15f05d-2ac2-4b83-987e-6f4b157b0b74,fe7dfda5-6846-4238-9ccc-472978eb78a1,ff1f2f31-8607-4627-b487-c5e39e7c30f5
0,TSPAN6,3600,4506,1065,6342,9201,1494,8658,3209,4679,...,8037,3875,2581,2300,2162,6421,6170,3345,5469,2127
1,TNMD,77,1,254,5,9,78,13,69,2,...,1,5,5,7,29,140,48,23,9,0
2,DPM1,2078,3746,1757,1913,4186,3955,6747,2785,2570,...,2814,870,2766,1418,3375,2061,2776,1792,2153,742
3,SCYL3,2576,1111,1472,640,2054,812,2791,1361,1342,...,1920,649,2002,674,2340,1620,1152,955,985,911
4,C1orf112,948,1404,262,665,4333,404,2377,1113,1121,...,795,257,799,691,1633,1324,968,576,1452,328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19933,AL451106.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19934,AC008763.4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
19935,AC006486.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19936,AL391628.1,17,7,9,2,11,8,12,6,14,...,6,6,5,8,13,5,5,7,6,1


## HER2-enriched

In [9]:
# Process HER2-enriched related miRNA-Seq and RNA-Seq files
df_mir_her2_reads, df_rna_her2_reads = files_processing(
    files_path=HER2_ENRICHED_PATH, files_prefix='her2-enriched'
)

In [10]:
# Print the DataFrame of processed HER2-enriched microRNA reads
df_mir_her2_reads

Unnamed: 0,mirna,71f38728-5265-4317-a9da-95129726fe2b,74c57e6a-badb-4c90-817f-f265e9f0eae7,cba39ceb-4b7e-49cb-8b01-c44b7e70ad25,0b85120a-29ec-4948-9eb3-dbea5709cb31,2bff15a5-c2ca-4529-ba3a-3042c0de59f3,b789f2c1-a793-41c3-80b1-781e3fe6dbca,b2c1efd2-e611-4dc8-bde7-cb6c2286fdaf,e48a4127-695e-47d7-8949-db5e12abe3ff,b46f17b8-62c8-4882-9978-d9c4d1b02d3a,...,db7aa78b-4bb9-4d89-9219-bc93eaa277a2,3ef75f19-8b1a-4d77-a84a-f89e9869c0bb,94daa651-5d59-4a69-bfd0-b95e146cee04,2e1de344-442c-4c2b-929a-74021d92c95b,7e4b904e-cacd-4c36-a5ad-0923bec03578,428c71b7-7fa6-4b34-b85c-bc67329db834,7843d077-04e2-41d2-bd64-73edd83e9c1f,6f335026-6df0-43f8-8076-fb6534e4420f,fb898446-9ba2-47d5-b86f-41a424c6fc38,f9b88865-6e80-46ed-98aa-6d81ba5d4a02
0,hsa-let-7a-1,27094,18310,11859,3668,18170,19743,113083,14565,3719,...,8754,13668,8326,9222,61535,6764,8536,19761,32121,9030
1,hsa-let-7a-2,26983,18167,11828,3766,18091,19849,112856,14602,3589,...,8680,13478,8227,9384,61358,6702,8463,19810,32285,9068
2,hsa-let-7a-3,27129,18306,11888,3668,18655,19857,113804,14436,3682,...,8921,14052,8243,9285,61959,6653,8606,19870,32494,9148
3,hsa-let-7b,50858,58978,60658,9934,63291,54597,274958,55401,6204,...,33278,68570,20762,12030,239778,38555,19621,20612,124448,38335
4,hsa-let-7c,5546,3698,1702,1701,1087,9970,19514,3162,618,...,2762,1909,2758,2885,11631,2198,1950,3410,4310,3724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,260,61,53,42,104,65,150,61,63,...,23,188,47,22,151,29,28,79,122,28
1878,hsa-mir-98,237,192,125,54,382,248,494,59,23,...,130,56,276,75,223,36,46,56,121,74
1879,hsa-mir-99a,1298,1440,340,402,280,4201,6730,1021,203,...,787,798,385,959,3700,522,866,851,987,1068


In [11]:
# Print the DataFrame of processed HER2-enriched gene reads
df_rna_her2_reads

Unnamed: 0,gene,eb166054-ff70-4a86-883b-9c25a7d2b0e5,d3badb09-df1e-488e-ade9-97f1925b5649,788a4858-c6c6-4a8d-9aef-dbd32d74776b,0a74ea3f-dadc-4c9f-96ea-62a77c1e602c,1936440a-9cfc-4b04-af40-3fbc22fe87f6,cee943a8-ea26-4b2f-b8a4-66300ccacfb5,0710056c-2f04-4182-90ca-45492dd6444c,5c0579f7-6184-4afa-805c-7aeec4a4c5d2,be6f8b24-e668-41c6-89f7-f4cdc8533cae,...,befa225f-9fe9-412f-a503-449874cf52d8,27aab1a6-96af-4dc1-b9ce-4690ead0d355,0c279a15-1250-4923-bcde-96cc3fd32d42,a4589532-53e0-4025-b8d3-e11e79e1fc9c,fd371070-c312-4eca-b3da-d41b1c8a86a7,7b188e2a-4a56-49f9-b527-cd61a646f6c7,f0a63361-78bb-4a7f-9d78-31f7b2980ba2,09a491b5-1bfb-47d9-963f-69f6c7e6c1e5,111b2865-e7dc-45c9-b206-20b5713714bc,0a7421d4-5722-427a-9643-da1074e5c25c
0,TSPAN6,2825,1412,3926,6278,3606,1349,2951,1379,585,...,539,1052,2700,1075,11016,8421,745,3834,735,1637
1,TNMD,19,0,8,18,2,0,5,0,6,...,1,12,0,0,2,0,0,0,0,0
2,DPM1,9021,3381,1212,2633,1816,878,4550,2176,793,...,5144,2201,2527,2191,3511,3904,1537,3161,1329,2061
3,SCYL3,3057,1237,856,1425,2893,378,3296,723,620,...,980,861,1093,2598,2828,2519,1798,1411,755,1944
4,C1orf112,530,1087,477,480,447,243,1247,383,184,...,423,546,575,381,3257,1186,497,637,597,656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19933,AL451106.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19934,AC008763.4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19935,AC006486.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19936,AL391628.1,10,6,4,9,0,0,31,0,0,...,2,5,3,4,2,7,5,0,3,1


## Luminal A

In [12]:
# Process Luminal A related miRNA-Seq and RNA-Seq files
df_mir_luma_reads, df_rna_luma_reads = files_processing(
    files_path=LUMINAL_A_PATH, files_prefix='luminal-a'
)

In [13]:
# Print the DataFrame of processed Luminal A microRNA reads
df_mir_luma_reads

Unnamed: 0,mirna,25e9d1e2-2fc3-4ee0-bf29-421adfcaf781,04e89bf1-f3f3-40dd-ab4e-298425cafb91,baf45d0a-2691-4988-a33e-8ee88bcbdbb6,32c24548-2714-4a90-bfb1-002f1c46c0c5,a3337105-df27-4bab-8051-3cfd830e35dc,2c4770c9-01db-4f5e-b392-fca2d1c0ddcb,4e515787-2cff-421d-8c77-f7bc35632c46,d5da7109-ec9b-45b4-ae60-1c3bf898c0e3,df44fccd-11dc-4fed-b2b9-74ff4081405e,...,f7d07de5-5429-446e-8aec-e7e2c4cb4c6d,d4976851-cef0-47d5-9ea6-8c72ece9ed7a,5d2a2e56-1301-4c5c-a26b-e8572281a3c3,2dc58f9b-608f-4df7-8459-6a0e84e3e893,6b261846-2cc0-4c86-b714-7429e18684d9,540f5a9f-330b-4817-baa6-490b83237292,f16e1b44-07c0-4621-9750-d63d13044c42,48e35d76-273a-4dd7-ad6a-cd740e109921,2afb9195-37c8-4e8c-9b39-53a2eefa6d31,e278cd4c-5cbb-42f7-9fc9-cc2c94a2bd4e
0,hsa-let-7a-1,32862,16800,27072,25552,93937,127722,23507,105043,18252,...,23850,27023,14279,13308,29538,53255,11507,132628,27583,8883
1,hsa-let-7a-2,32423,16826,26741,25591,92867,127771,23548,104866,18365,...,23972,26691,14257,13203,29169,52976,11420,132784,27434,8773
2,hsa-let-7a-3,32782,16939,27170,25844,93599,128456,23664,104874,18398,...,23999,27094,14498,13538,29648,53722,11389,133211,27900,9005
3,hsa-let-7b,149422,91825,102645,46093,374939,317710,128421,57339,102483,...,303316,81623,34807,62573,34622,102564,28243,562505,184807,40960
4,hsa-let-7c,303,5048,7572,1672,2662,35368,7735,7048,3153,...,8001,6303,3664,3850,8624,12621,3540,9214,7345,2788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,165,57,40,116,160,130,47,210,7,...,34,48,46,186,49,57,102,40,154,18
1878,hsa-mir-98,41,185,88,102,151,390,104,64,69,...,60,102,65,64,149,198,273,265,67,110
1879,hsa-mir-99a,38,1222,1122,591,969,13127,2937,2056,871,...,1700,1526,755,966,2563,3303,520,1341,2564,629


In [14]:
# Print the DataFrame of processed Luminal A gene reads
df_rna_luma_reads

Unnamed: 0,gene,e5b0c2dc-c652-40a0-bb80-d7e87830b406,8d87fbd5-8ca7-4ff6-b8b1-79f098dbca9f,bc0a4326-63d2-4ec4-9af6-7b421aa8aa49,03d18286-1038-4c0c-9c05-d1269e280250,bf26a3fb-39a8-4eff-b801-a7dd915e73f5,81f86e31-55db-4483-b6e0-55451ffbd1db,d914654b-ec4f-4be5-a525-2997a4b39279,072cae2e-f4c1-4a3f-85e1-6f7e35a65108,4c75e72b-db9f-425b-9f14-946ee801071e,...,6df72103-ccfb-4a53-b839-2e8c47ec2145,a03952ab-2ac1-43a5-a438-1f022bcd0b16,03891509-3109-450d-8564-77b024a6128e,65bc2a34-2355-4193-b20e-439f2d7f6df2,c64a0adb-56a8-4966-9dd9-1acd789c1bb3,892928f7-8239-4d15-812b-046f9192d7de,196aed5b-6812-47c5-ad7b-4dd1743ed0b5,987d401a-3e68-414e-8067-afe277b02fad,1a4285b8-765b-4954-b9f2-dc8609572889,8f79405a-c78e-4a9e-a673-969658a0f90b
0,TSPAN6,2898,2969,1236,1825,5055,1890,1134,3498,787,...,7561,1472,655,3324,1847,4278,2177,5269,2436,2610
1,TNMD,27,760,4,63,6,5,20,1,114,...,4,8,41,32,86,4,33,57,59,1
2,DPM1,1312,1581,2412,1737,1849,6022,1312,2713,1989,...,2978,1950,2275,1658,1124,2609,1502,1896,1254,1406
3,SCYL3,2001,1535,1480,1596,1694,1725,964,3231,2530,...,1914,2134,3068,961,718,1005,1027,1262,2138,880
4,C1orf112,767,423,328,483,393,728,467,905,764,...,618,863,800,702,298,495,457,334,931,243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19933,AL451106.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19934,AC008763.4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19935,AC006486.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19936,AL391628.1,10,11,8,7,4,7,8,3,13,...,4,8,8,9,6,5,5,3,21,0


## Luminal B

In [15]:
# Process Luminal B related miRNA-Seq and RNA-Seq files
df_mir_lumb_reads, df_rna_lumb_reads = files_processing(
    files_path=LUMINAL_B_PATH, files_prefix='luminal-b'
)

In [16]:
# Print the DataFrame of processed Luminal A microRNA reads
df_mir_luma_reads

Unnamed: 0,mirna,25e9d1e2-2fc3-4ee0-bf29-421adfcaf781,04e89bf1-f3f3-40dd-ab4e-298425cafb91,baf45d0a-2691-4988-a33e-8ee88bcbdbb6,32c24548-2714-4a90-bfb1-002f1c46c0c5,a3337105-df27-4bab-8051-3cfd830e35dc,2c4770c9-01db-4f5e-b392-fca2d1c0ddcb,4e515787-2cff-421d-8c77-f7bc35632c46,d5da7109-ec9b-45b4-ae60-1c3bf898c0e3,df44fccd-11dc-4fed-b2b9-74ff4081405e,...,f7d07de5-5429-446e-8aec-e7e2c4cb4c6d,d4976851-cef0-47d5-9ea6-8c72ece9ed7a,5d2a2e56-1301-4c5c-a26b-e8572281a3c3,2dc58f9b-608f-4df7-8459-6a0e84e3e893,6b261846-2cc0-4c86-b714-7429e18684d9,540f5a9f-330b-4817-baa6-490b83237292,f16e1b44-07c0-4621-9750-d63d13044c42,48e35d76-273a-4dd7-ad6a-cd740e109921,2afb9195-37c8-4e8c-9b39-53a2eefa6d31,e278cd4c-5cbb-42f7-9fc9-cc2c94a2bd4e
0,hsa-let-7a-1,32862,16800,27072,25552,93937,127722,23507,105043,18252,...,23850,27023,14279,13308,29538,53255,11507,132628,27583,8883
1,hsa-let-7a-2,32423,16826,26741,25591,92867,127771,23548,104866,18365,...,23972,26691,14257,13203,29169,52976,11420,132784,27434,8773
2,hsa-let-7a-3,32782,16939,27170,25844,93599,128456,23664,104874,18398,...,23999,27094,14498,13538,29648,53722,11389,133211,27900,9005
3,hsa-let-7b,149422,91825,102645,46093,374939,317710,128421,57339,102483,...,303316,81623,34807,62573,34622,102564,28243,562505,184807,40960
4,hsa-let-7c,303,5048,7572,1672,2662,35368,7735,7048,3153,...,8001,6303,3664,3850,8624,12621,3540,9214,7345,2788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,165,57,40,116,160,130,47,210,7,...,34,48,46,186,49,57,102,40,154,18
1878,hsa-mir-98,41,185,88,102,151,390,104,64,69,...,60,102,65,64,149,198,273,265,67,110
1879,hsa-mir-99a,38,1222,1122,591,969,13127,2937,2056,871,...,1700,1526,755,966,2563,3303,520,1341,2564,629


In [17]:
# Print the DataFrame of processed Luminal B gene reads
df_rna_lumb_reads

Unnamed: 0,gene,dc8b305a-6f43-410e-9ce6-96ac24827550,2c3000b7-4db9-4f00-a82a-ca6802806631,e4086c26-d200-4e42-8249-ed8cbeec0951,76267112-851f-4f5b-af40-f3c90af1b2ce,5a827399-307b-412b-a2ce-f6c81d2750a6,a8ab2eb8-9da3-494f-b740-dc3f4185acb2,ee89e01e-d54e-4166-a3ae-db9357639523,a127fb1e-aa3b-4b6d-98b5-c141ffba9ae7,84ab5edd-38bf-4987-b9af-fa4d1cbdef2c,...,c7571e35-d912-4935-b2c3-bbba019fe9f8,9765934d-d954-485c-abfe-b9db59f18193,da333184-1ddd-4cd7-9c0d-e4526a03a1c9,318c3a49-64bc-4704-8739-d4cf31cef51f,c1778b3c-5649-4e26-b937-fd00ffd58387,d393af6b-de4b-4ca6-bc06-a29068dfe086,f502949f-ca10-4617-a7ee-b8f142f7ec93,0a688b9b-06ea-4b83-bfce-9cee0d866a0c,4c60d168-b545-46a8-8c37-81972a537a83,3eadce70-ba3d-4806-b112-6ac83bc89dc4
0,TSPAN6,3372,2304,4271,1355,2947,1721,3148,1775,2343,...,1954,2973,1620,83,3427,146,1340,2519,3513,3092
1,TNMD,24,7,92,1,13,78,12,7,2,...,16,4,7,1,12,0,13,0,2,4
2,DPM1,2335,4979,2343,2185,3136,2338,2303,1819,1699,...,2123,3545,2535,2163,3875,1360,1757,1233,7502,3044
3,SCYL3,1675,1544,3467,1005,1295,2104,1860,1789,1092,...,1802,1675,2585,626,2473,921,1132,1522,2786,1284
4,C1orf112,1693,1723,2193,559,860,693,637,911,664,...,694,1188,1144,491,1156,687,598,455,908,712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19933,AL451106.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19934,AC008763.4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19935,AC006486.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19936,AL391628.1,8,7,15,11,7,11,1,9,8,...,13,7,23,8,6,1,8,0,8,9


# Normal Tissue Analysis Files

In [18]:
# Process normal tissue related miRNA-Seq and RNA-Seq files
df_mir_normal_reads, df_rna_normal_reads = files_processing(
    files_path=NORMAL_TISSUE_PATH, files_prefix='normal-tissue'
)

In [19]:
# Print the DataFrame of processed normal tissue microRNA reads
df_mir_normal_reads

Unnamed: 0,mirna,2c658173-1941-45fe-baf5-ab67dfbcdfff,9c6e6db1-4096-47e2-9b77-abd2b4c7d180,ed3fd3e3-1b78-4a4f-8419-f4111c8c5c73,70c76e2f-3ce4-4864-8935-eaa51605612a,c1c6a9a5-73f2-414a-896d-4d3e564f6e59,133ae78e-fd83-4a13-9de6-28dbbe8e37c7,1540ed84-19b0-49d5-a728-3b04a29abe8a,5f2f42c5-e9d1-4a47-86a7-fd19308eca52,f1394f51-c3ab-45e8-87a2-fe641f00518e,...,5608dff0-72ab-468c-b25f-d411ad581052,6d88ce71-ecd1-4edf-8935-ea9137ed1ef1,a416945e-cd2d-43f6-aeff-d35cb66684df,6f2d4c85-893f-40ed-8ec1-a03b53ec59c1,641c7068-0f33-42dd-a4fe-cafb541826ee,dc731794-8d31-4fb8-a4a5-c894e788fa0b,28a15efb-f538-41fd-9517-0fb86cae338b,79ab1074-d958-4de9-98a6-934277350b07,6fef6e45-aa6e-41e6-98b1-5dbb9a67b31f,f2c479c0-f730-4751-a83c-8be312ea9f04
0,hsa-let-7a-1,30679,13371,46234,54834,43565,67039,54257,93179,35570,...,27032,15784,28515,65817,32214,29908,36349,19575,139569,26134
1,hsa-let-7a-2,30438,13341,46323,55455,43403,67213,54744,92741,35437,...,26846,15776,28825,65745,32014,29774,36384,19294,139951,26279
2,hsa-let-7a-3,30952,13395,45914,55384,44242,67055,55130,92957,35726,...,27388,15781,28729,65547,32448,30292,36816,19574,139472,26431
3,hsa-let-7b,171173,63077,146223,71425,96955,135813,75328,164726,93741,...,65008,76337,65819,138245,79689,161965,53413,109858,260188,83076
4,hsa-let-7c,31981,10632,29352,24301,21725,35164,21758,47779,24437,...,2189,15812,19732,37239,20069,30527,15435,20838,54744,13508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,17,2,26,9,17,22,4,11,24,...,53,7,8,10,7,18,9,7,18,10
1878,hsa-mir-98,76,32,103,245,177,168,126,144,112,...,66,56,97,147,84,97,207,49,163,70
1879,hsa-mir-99a,9813,10410,12587,4943,5457,14149,6742,15834,6966,...,621,7781,5328,12485,4982,10254,3944,7357,23157,5367


In [20]:
# Print the DataFrame of processed normal tissue gene reads
df_rna_normal_reads

Unnamed: 0,gene,cbfb8ffe-ae83-4a16-aaa6-f21ea893cb8d,baec6a46-7c48-41ed-a8a9-eef52d32cba3,ea1fadc2-1cdc-4658-9619-eeb26ae09da8,8ebe0bf6-11fa-418d-918c-5c73f0e7e9ac,b70b68a1-28c4-4ed3-a04c-b622e583f10b,4df2233b-3bbc-4d20-9abc-2a09b3f37383,a37587aa-2e1f-42f5-a691-c4a41ae79ea9,3071e512-94ea-4820-9573-668235188e34,8a84b9a5-d453-416b-b481-f15402c2eb54,...,fd94a0ed-37d8-49a5-af96-e2160a9e6096,b88daf3f-645d-4d11-a2be-2295e186747f,38854c85-fc09-4a51-93a7-257762517583,736fca14-66fb-481a-985e-7253f75243a8,d7a48283-c113-4745-be6b-553966e6b457,15f0e499-8d53-4e97-a392-334875d25cf4,3aad7b0b-9f82-41d0-b3e5-5614afcac6a8,23bf74db-bb4e-44c5-8473-e651b818e460,1320db11-22a5-417f-8ec7-65c0bf4681a2,68881256-49b8-4a19-87a9-afac4f1841d2
0,TSPAN6,6447,3607,3791,6854,3919,5384,2849,3820,2829,...,4472,3260,8064,6679,2407,5784,5005,5383,4435,4158
1,TNMD,3496,3612,228,318,46,712,160,105,240,...,692,401,1408,569,303,209,235,319,653,75
2,DPM1,3832,1246,2184,3435,1242,1500,1177,1705,1067,...,1428,1505,2647,2240,1134,1861,1762,2465,1705,1456
3,SCYL3,1002,670,1515,2107,982,1636,1187,1678,928,...,1107,1049,2368,1980,560,2197,2052,1191,1094,1446
4,C1orf112,370,202,294,698,222,338,244,292,255,...,218,263,590,458,216,411,361,272,282,299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19933,AL451106.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19934,AC008763.4,2,2,0,0,0,1,0,0,0,...,1,0,2,0,0,0,0,6,0,0
19935,AC006486.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19936,AL391628.1,12,18,19,9,4,13,9,3,12,...,16,6,10,8,26,4,5,5,4,6
