# **Processing Files of Interest from TCGA-BRCA**
TCGA: The Cancer Genome Atlas

# Importing Libraries

In [1]:
import os

import pandas as pd

# Paths

In [2]:
# Data folder path
DATA_FOLDER = '../../data'

# External data folder path
EXTERNAL_DATA_PATH = f'{DATA_FOLDER}/external/tcga-brca'

# Basal-like files folder path
BASAL_LIKE_PATH = f'{EXTERNAL_DATA_PATH}/basal-like-files'

# HER2-enriched files folder path
HER2_ENRICHED_PATH = f'{EXTERNAL_DATA_PATH}/her2-enriched-files'

# Luminal A files folder path
LUMINAL_A_PATH = f'{EXTERNAL_DATA_PATH}/luminal-a-files'

# Luminal B files folder path
LUMINAL_B_PATH = f'{EXTERNAL_DATA_PATH}/luminal-b-files'

# Normal tissue files folder path
NORMAL_TISSUE_PATH = f'{EXTERNAL_DATA_PATH}/normal-tissue-files'

# Processed data folder path
PROCESSED_DATA_PATH = f'{DATA_FOLDER}/processed'

# Functions

In [3]:
def mir_files_processing(path):
    # List the miRNA-Seq files contained in the folder
    files = [f for f in os.listdir(path) if f.startswith('mirna-seq_')]

    # Retrieve the file id of the first file
    file_id = files[0].replace('mirna-seq_', '')
    file_id = file_id.replace('.txt', '')

    # Initialize the DataFrame with the read count of the first file
    df_mir_reads = pd.read_csv(f'{path}/{files[0]}', sep='\t')
    df_mir_reads = df_mir_reads \
        [['miRNA_ID', 'read_count']] \
        .rename(columns={'read_count': file_id})

    # Transform each file read count into a new column of the DataFrame
    for file in files[1:]:
        # Retrieve the file id
        file_id = file.replace('mirna-seq_', '')
        file_id = file_id.replace('.txt', '')

        # Read the file and rename the read count column
        df_temp = pd.read_csv(f'{path}/{file}', sep='\t')
        df_temp = df_temp \
            [['miRNA_ID', 'read_count']] \
            .rename(columns={'read_count': file_id})

        # Add the file read count as a new DataFrame column
        df_mir_reads = df_mir_reads \
            .merge(
                right=df_temp,
                left_on='miRNA_ID',
                right_on='miRNA_ID',
                how='outer'
            )
    
    # Rename the microRNA id column
    df_mir_reads = df_mir_reads.rename(columns={'miRNA_ID': 'mirna'})
    
    return df_mir_reads

# Tumor Tissue Analysis Files

## Basal-like

In [4]:
# Process Basal-like related miRNA-Seq files
df_mir_basal_like = mir_files_processing(BASAL_LIKE_PATH)

# Store the DataFrame of processed Basal-like miRNA-Seq files in a CSV file
file_name = 'basal-like-mirna-reads.csv'
df_mir_basal_like.to_csv(f'{PROCESSED_DATA_PATH}/{file_name}', index=False)

In [5]:
# Print the DataFrame of processed Basal-like miRNA-Seq files
df_mir_basal_like

Unnamed: 0,mirna,1d47e720-1a02-45f4-b0dc-99861916e3e1,e1271843-3bd3-4f0b-8e6f-fc763ad65776,8b18c127-13dc-4ca5-b3e8-38ccbd21b010,9adf1983-85b9-4ef6-aff4-dea772ff91a2,3cdba1e4-b7d4-47ea-8cae-68a8262515d4,6b1c475e-b897-457e-a6b7-3aad397202b4,b5e2d74c-1626-42de-b97c-4e988e44f9f5,5221cacc-3d04-4a17-8822-6fd9b4bfb2a4,d3ed4500-368c-42a3-998b-7aca451db233,...,e68bb5da-1d15-4de9-b8d4-eac78a205699,dda41694-ea43-4044-a1a6-7c445734c36c,9e73d394-b8e3-4ee2-bb35-a1fb43fd280f,73a1ad6d-17fa-4f62-9275-8c6a5c0c1f2d,de750a3e-5286-44ec-a701-bcaa7a06e569,0b9c4345-bf3a-4de7-a022-4039f93d7b27,3e4b3c1f-cbb8-487f-9c9e-01291970ade3,aa4061d1-8b1e-4aed-b0d7-fe3a5466737e,b5604d13-4cde-44fa-be67-203bc5da0caf,788a05e5-5fee-4430-b303-d4bc30649158
0,hsa-let-7a-1,12807,6625,14050,9217,9647,37461,10941,6618,8235,...,24466,7942,9577,10976,188207,14831,27985,29594,20308,6605
1,hsa-let-7a-2,12718,6502,13976,9389,9306,37071,10795,6642,8117,...,24295,7672,9612,11292,187398,14650,27512,29517,20127,6589
2,hsa-let-7a-3,13064,6864,14274,9587,9377,37213,10973,6640,8176,...,24856,7765,9694,11200,190402,14915,27972,29681,20577,6741
3,hsa-let-7b,26120,33714,26191,70636,31930,106137,20726,12539,17189,...,102860,39327,11352,40680,527829,50167,62915,61024,50021,23325
4,hsa-let-7c,4490,708,1529,2697,6327,1170,7983,4253,3670,...,9166,3764,1373,4167,13574,1290,12453,7260,9049,832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,178,69,22,7,43,73,111,29,93,...,115,10,46,36,227,33,129,50,96,117
1878,hsa-mir-98,234,61,96,110,92,139,126,117,88,...,301,137,59,21,1192,192,209,214,304,195
1879,hsa-mir-99a,1278,212,474,428,2808,267,829,871,730,...,3021,1284,158,643,4145,288,3581,3445,2070,373


## HER2-enriched

In [6]:
# Process HER2-enriched related miRNA-Seq files
df_mir_her2_enriched = mir_files_processing(HER2_ENRICHED_PATH)

# Store the DataFrame of processed HER2-enriched miRNA-Seq files in a CSV file
file_name = 'her2-enriched-mirna-reads.csv'
df_mir_her2_enriched.to_csv(f'{PROCESSED_DATA_PATH}/{file_name}', index=False)

In [7]:
# Print the DataFrame of processed HER2-enriched miRNA-Seq files
df_mir_her2_enriched

Unnamed: 0,mirna,71f38728-5265-4317-a9da-95129726fe2b,74c57e6a-badb-4c90-817f-f265e9f0eae7,cba39ceb-4b7e-49cb-8b01-c44b7e70ad25,0b85120a-29ec-4948-9eb3-dbea5709cb31,2bff15a5-c2ca-4529-ba3a-3042c0de59f3,b789f2c1-a793-41c3-80b1-781e3fe6dbca,b2c1efd2-e611-4dc8-bde7-cb6c2286fdaf,e48a4127-695e-47d7-8949-db5e12abe3ff,b46f17b8-62c8-4882-9978-d9c4d1b02d3a,...,db7aa78b-4bb9-4d89-9219-bc93eaa277a2,3ef75f19-8b1a-4d77-a84a-f89e9869c0bb,94daa651-5d59-4a69-bfd0-b95e146cee04,2e1de344-442c-4c2b-929a-74021d92c95b,7e4b904e-cacd-4c36-a5ad-0923bec03578,428c71b7-7fa6-4b34-b85c-bc67329db834,7843d077-04e2-41d2-bd64-73edd83e9c1f,6f335026-6df0-43f8-8076-fb6534e4420f,fb898446-9ba2-47d5-b86f-41a424c6fc38,f9b88865-6e80-46ed-98aa-6d81ba5d4a02
0,hsa-let-7a-1,27094,18310,11859,3668,18170,19743,113083,14565,3719,...,8754,13668,8326,9222,61535,6764,8536,19761,32121,9030
1,hsa-let-7a-2,26983,18167,11828,3766,18091,19849,112856,14602,3589,...,8680,13478,8227,9384,61358,6702,8463,19810,32285,9068
2,hsa-let-7a-3,27129,18306,11888,3668,18655,19857,113804,14436,3682,...,8921,14052,8243,9285,61959,6653,8606,19870,32494,9148
3,hsa-let-7b,50858,58978,60658,9934,63291,54597,274958,55401,6204,...,33278,68570,20762,12030,239778,38555,19621,20612,124448,38335
4,hsa-let-7c,5546,3698,1702,1701,1087,9970,19514,3162,618,...,2762,1909,2758,2885,11631,2198,1950,3410,4310,3724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,260,61,53,42,104,65,150,61,63,...,23,188,47,22,151,29,28,79,122,28
1878,hsa-mir-98,237,192,125,54,382,248,494,59,23,...,130,56,276,75,223,36,46,56,121,74
1879,hsa-mir-99a,1298,1440,340,402,280,4201,6730,1021,203,...,787,798,385,959,3700,522,866,851,987,1068


## Luminal A

In [8]:
# Process Luminal A related miRNA-Seq files
df_mir_luminal_a = mir_files_processing(LUMINAL_A_PATH)

# Store the DataFrame of processed Luminal A miRNA-Seq files in a CSV file
file_name = 'luminal-a-mirna-reads.csv'
df_mir_luminal_a.to_csv(f'{PROCESSED_DATA_PATH}/{file_name}', index=False)

In [9]:
# Print the DataFrame of processed Luminal A miRNA-Seq files
df_mir_luminal_a

Unnamed: 0,mirna,25e9d1e2-2fc3-4ee0-bf29-421adfcaf781,04e89bf1-f3f3-40dd-ab4e-298425cafb91,baf45d0a-2691-4988-a33e-8ee88bcbdbb6,32c24548-2714-4a90-bfb1-002f1c46c0c5,a3337105-df27-4bab-8051-3cfd830e35dc,2c4770c9-01db-4f5e-b392-fca2d1c0ddcb,4e515787-2cff-421d-8c77-f7bc35632c46,d5da7109-ec9b-45b4-ae60-1c3bf898c0e3,df44fccd-11dc-4fed-b2b9-74ff4081405e,...,f7d07de5-5429-446e-8aec-e7e2c4cb4c6d,d4976851-cef0-47d5-9ea6-8c72ece9ed7a,5d2a2e56-1301-4c5c-a26b-e8572281a3c3,2dc58f9b-608f-4df7-8459-6a0e84e3e893,6b261846-2cc0-4c86-b714-7429e18684d9,540f5a9f-330b-4817-baa6-490b83237292,f16e1b44-07c0-4621-9750-d63d13044c42,48e35d76-273a-4dd7-ad6a-cd740e109921,2afb9195-37c8-4e8c-9b39-53a2eefa6d31,e278cd4c-5cbb-42f7-9fc9-cc2c94a2bd4e
0,hsa-let-7a-1,32862,16800,27072,25552,93937,127722,23507,105043,18252,...,23850,27023,14279,13308,29538,53255,11507,132628,27583,8883
1,hsa-let-7a-2,32423,16826,26741,25591,92867,127771,23548,104866,18365,...,23972,26691,14257,13203,29169,52976,11420,132784,27434,8773
2,hsa-let-7a-3,32782,16939,27170,25844,93599,128456,23664,104874,18398,...,23999,27094,14498,13538,29648,53722,11389,133211,27900,9005
3,hsa-let-7b,149422,91825,102645,46093,374939,317710,128421,57339,102483,...,303316,81623,34807,62573,34622,102564,28243,562505,184807,40960
4,hsa-let-7c,303,5048,7572,1672,2662,35368,7735,7048,3153,...,8001,6303,3664,3850,8624,12621,3540,9214,7345,2788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,165,57,40,116,160,130,47,210,7,...,34,48,46,186,49,57,102,40,154,18
1878,hsa-mir-98,41,185,88,102,151,390,104,64,69,...,60,102,65,64,149,198,273,265,67,110
1879,hsa-mir-99a,38,1222,1122,591,969,13127,2937,2056,871,...,1700,1526,755,966,2563,3303,520,1341,2564,629


## Luminal B

In [10]:
# Process Luminal B related miRNA-Seq files
df_mir_luminal_b = mir_files_processing(LUMINAL_B_PATH)

# Store the DataFrame of processed Luminal B miRNA-Seq files in a CSV file
file_name = 'luminal-b-mirna-reads.csv'
df_mir_luminal_b.to_csv(f'{PROCESSED_DATA_PATH}/{file_name}', index=False)

In [11]:
# Print the DataFrame of processed Luminal B miRNA-Seq files
df_mir_luminal_b

Unnamed: 0,mirna,4f513085-a7f0-495c-9a87-34f70d757900,a9e1fe98-7705-4751-8565-178d78bfe746,89663744-7c27-4310-a869-03fb9b499df0,a44c536a-f222-44e7-87f4-e63968b2017b,2f047527-d137-4f50-870d-1de05964627c,2a30b4b5-e925-469c-8555-9cc602ea4e1f,05f36336-c79f-45cf-bdde-1acda13d415b,12d172c8-5deb-447b-b651-ed10adff51d6,b1e9cb2b-97ea-46b0-8d25-281843c50161,...,406fd7d3-b73a-4014-b315-1eeaae01e988,e275259d-aa1d-45ab-937c-6263a61a6cc8,5c97f076-c198-4d3d-b680-97c3e9fb08db,52bf7f7d-166f-4207-8737-ea117f3be518,aa94ebe3-13f3-40e6-98ee-4829309d819c,f2ac5078-6abc-4e01-80d1-a0d723fe91af,8c105b54-7d1e-42ba-ba2c-259d2f7e9278,939775c8-4388-4f97-997f-efb92dc93c79,21abffb3-3e07-4022-86ff-51a41036cfc4,00d39f52-dfb1-4256-a6a9-54dea7a0401e
0,hsa-let-7a-1,18607,27227,3499,4353,23638,27123,19673,19524,39154,...,7572,22876,12025,8737,18434,17221,34001,14177,18342,25206
1,hsa-let-7a-2,18589,27407,3561,4295,23592,27485,19643,19496,38890,...,7383,22808,12002,8767,18405,17134,33838,14011,18203,24819
2,hsa-let-7a-3,18661,27734,3597,4384,23729,27508,19871,19446,38951,...,7692,23063,11840,9047,18822,17118,33943,14178,18238,25089
3,hsa-let-7b,32968,98212,8744,20700,66434,50520,53566,66558,64379,...,19924,67137,66577,28602,90342,40366,56931,40220,37977,31517
4,hsa-let-7c,469,6967,1007,3039,1331,2165,7037,1814,1731,...,3000,786,7761,1588,2404,1153,7952,2728,5013,1147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,54,45,18,307,182,104,317,116,56,...,33,162,37,27,34,38,90,175,74,412
1878,hsa-mir-98,155,155,46,122,165,109,108,89,201,...,124,133,65,106,113,138,282,104,60,169
1879,hsa-mir-99a,170,2652,192,437,337,461,3249,244,553,...,754,289,2303,338,397,297,2318,939,1370,316


# Normal Tissue Analysis Files

In [12]:
# Process normal tissue related miRNA-Seq files
df_mir_normal_tissue = mir_files_processing(NORMAL_TISSUE_PATH)

# Store the DataFrame of processed normal tissue miRNA-Seq files in a CSV file
file_name = 'normal-tissue-mirna-reads.csv'
df_mir_normal_tissue.to_csv(f'{PROCESSED_DATA_PATH}/{file_name}', index=False)

In [13]:
# Print the DataFrame of processed normal tissue miRNA-Seq files
df_mir_normal_tissue

Unnamed: 0,mirna,2c658173-1941-45fe-baf5-ab67dfbcdfff,9c6e6db1-4096-47e2-9b77-abd2b4c7d180,ed3fd3e3-1b78-4a4f-8419-f4111c8c5c73,70c76e2f-3ce4-4864-8935-eaa51605612a,c1c6a9a5-73f2-414a-896d-4d3e564f6e59,133ae78e-fd83-4a13-9de6-28dbbe8e37c7,1540ed84-19b0-49d5-a728-3b04a29abe8a,5f2f42c5-e9d1-4a47-86a7-fd19308eca52,f1394f51-c3ab-45e8-87a2-fe641f00518e,...,5608dff0-72ab-468c-b25f-d411ad581052,6d88ce71-ecd1-4edf-8935-ea9137ed1ef1,a416945e-cd2d-43f6-aeff-d35cb66684df,6f2d4c85-893f-40ed-8ec1-a03b53ec59c1,641c7068-0f33-42dd-a4fe-cafb541826ee,dc731794-8d31-4fb8-a4a5-c894e788fa0b,28a15efb-f538-41fd-9517-0fb86cae338b,79ab1074-d958-4de9-98a6-934277350b07,6fef6e45-aa6e-41e6-98b1-5dbb9a67b31f,f2c479c0-f730-4751-a83c-8be312ea9f04
0,hsa-let-7a-1,30679,13371,46234,54834,43565,67039,54257,93179,35570,...,27032,15784,28515,65817,32214,29908,36349,19575,139569,26134
1,hsa-let-7a-2,30438,13341,46323,55455,43403,67213,54744,92741,35437,...,26846,15776,28825,65745,32014,29774,36384,19294,139951,26279
2,hsa-let-7a-3,30952,13395,45914,55384,44242,67055,55130,92957,35726,...,27388,15781,28729,65547,32448,30292,36816,19574,139472,26431
3,hsa-let-7b,171173,63077,146223,71425,96955,135813,75328,164726,93741,...,65008,76337,65819,138245,79689,161965,53413,109858,260188,83076
4,hsa-let-7c,31981,10632,29352,24301,21725,35164,21758,47779,24437,...,2189,15812,19732,37239,20069,30527,15435,20838,54744,13508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,hsa-mir-9500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1877,hsa-mir-96,17,2,26,9,17,22,4,11,24,...,53,7,8,10,7,18,9,7,18,10
1878,hsa-mir-98,76,32,103,245,177,168,126,144,112,...,66,56,97,147,84,97,207,49,163,70
1879,hsa-mir-99a,9813,10410,12587,4943,5457,14149,6742,15834,6966,...,621,7781,5328,12485,4982,10254,3944,7357,23157,5367
