In [1]:
!conda env create -f environment.yml
!pip install ipykernel
!python -m ipykernel install --user --name JupyterLab --display-name "Python (JupyterLab)"


CondaValueError: prefix already exists: /home/kbanerj1/.conda/envs/JupyterLab

Installed kernelspec JupyterLab in /home/kbanerj1/.local/share/jupyter/kernels/jupyterlab


In [1]:
import os
import pandas as pds
import glob
import time

In [2]:
os.chdir("/home/kbanerj1/TCGA_ML")
# Verify the current directory
print("Current Directory:", os.getcwd())

Current Directory: /home/kbanerj1/TCGA_ML


## Reading Gene Data

In [3]:
def read_gene_df(filepath, filename):
    '''
    Function that reads a gene file and processes the gene data with the file name.
    Skips rows with unwanted gene names (e.g., N_unmapped, N_multimapping, N_noFeature, N_ambiguous).
    Retains only the tpm_unstranded column.
    '''
    
    df1 = pd.read_csv(filepath, sep='\t', skiprows=1)
    unwanted_genes = ['N_unmapped', 'N_multimapping', 'N_noFeature', 'N_ambiguous']
    df1 = df1[~df1['gene_id'].isin(unwanted_genes)]
    df1 = df1[['gene_name', 'tpm_unstranded']]
    df1.set_index('gene_name', inplace=True)
    df1 = df1.T
    processed_filename = filename.split('.')[0]
    df1.index = [processed_filename]
    df1 = df1.reset_index()
    df1.rename(columns={'index': 'file_name'}, inplace=True)
    
    return df1

In [4]:
# Get the list of file paths
filelist = glob.glob("./data/*/*.tsv")
all_dfs = []

for filepath in filelist:
    filename =  os.path.basename(filepath)  # get filename from path
    df = read_gene_df(filepath, filename)
    all_dfs.append(df) 

combined_df = pd.concat(all_dfs, ignore_index=True)
combined_df

gene_name,file_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
0,0ece26e3-b25e-4b16-b0f8-6a62d128b084,50.8266,0.0394,68.0495,8.9287,1.9557,7.0591,4.9134,2.8817,11.1888,...,0.0,0.0000,0.3160,0.0,0.0000,0.0,17.8409,0.0,0.0483,4.9789
1,fd950816-2bc3-4e25-90b4-6a6a049cd809,43.9793,0.1093,66.7390,4.1823,2.8904,19.2646,12.2063,66.9193,24.1118,...,0.0,0.0000,0.5660,0.0,0.0000,0.0,9.3709,0.0,0.0335,1.8239
2,b8d729a4-658d-49c7-b6c1-3490041ad3e1,53.2076,0.1048,54.1207,5.8853,4.1955,5.0059,2.5866,11.0124,31.2974,...,0.0,0.0000,0.6104,0.0,0.0000,0.0,26.3106,0.0,0.0402,1.7946
3,deacae6a-5f74-4a07-8ee7-5f378fa5ea41,57.5719,0.0000,86.4358,4.7965,3.3714,3.2967,3.0484,9.9543,12.3831,...,0.0,0.0000,0.2795,0.0,0.0000,0.0,19.4321,0.0,0.0151,6.4643
4,36df1baa-1477-4192-a6f1-51585f29f6b2,26.5473,0.0861,61.5650,5.0289,2.1703,7.7559,9.7178,4.3662,15.7490,...,0.0,0.0000,0.4569,0.0,0.0000,0.0,17.3981,0.0,0.0462,7.1829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,edf2bc5d-3b50-411a-819e-aac3aa1207aa,67.4699,1.6255,78.4940,7.7254,4.3927,13.4459,8.2017,29.0845,30.7405,...,0.0,0.0000,0.5189,0.0,0.0000,0.0,18.2797,0.0,0.1102,1.2953
703,ee33c2fd-140b-4b4c-8820-f711facc6b4c,78.6831,1.8529,60.1268,6.2156,5.1865,8.4333,21.7751,45.8958,15.1308,...,0.0,0.0000,0.3599,0.0,0.0000,0.0,16.4865,0.0,0.0812,1.3372
704,814e81ff-bb80-4c3c-bd94-d86c89de9a36,9.4880,0.0000,36.2045,2.9239,3.0229,1.2153,49.4383,33.9834,7.3017,...,0.0,3.1495,0.1907,0.0,7.7446,0.0,2.3007,0.0,0.0540,1.2654
705,200bfb25-663b-4216-be51-bafaea7c5d4a,37.7357,0.0758,245.2148,6.3458,12.5770,0.0000,0.0000,68.7707,9.0761,...,0.0,4.9733,0.7199,0.0,6.7262,0.0,8.9586,0.0,0.0349,0.7993


In [5]:
## sanity check for columns that have all zeros across rows (samples)
zero_read_genes = combined_df.columns[(combined_df == 0).all(axis=0)]
zero_read_genes

Index(['CD99', 'NME1-NME2', 'Z83844.1', 'MCTS2P', 'CORO7-PAM16', 'ICAM4',
       'AC004837.1', 'AL021546.1', 'PEDS1-UBE2V1', 'VAMP7',
       ...
       'CDR1', 'AC114982.2', 'AC114402.2', 'AC084756.2', 'AL031178.2',
       'ACTL10', 'AC119733.1', 'AL451106.1', 'AC136977.1', 'AC006486.3'],
      dtype='object', name='gene_name', length=2657)

In [7]:
# Drop zero count genes
combined_df = combined_df.drop(columns=zero_read_genes)
combined_df

gene_name,file_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,AC010980.1,AC007511.1,AC004233.4,AC092910.4,AC073611.1,AC078856.1,AC008763.4,AL592295.6,AL391628.1,AP006621.6
0,0ece26e3-b25e-4b16-b0f8-6a62d128b084,50.8266,0.0394,68.0495,8.9287,1.9557,7.0591,4.9134,2.8817,11.1888,...,0.1370,0.0000,0.3995,0.0000,0.3160,0.0000,0.0,17.8409,0.0483,4.9789
1,fd950816-2bc3-4e25-90b4-6a6a049cd809,43.9793,0.1093,66.7390,4.1823,2.8904,19.2646,12.2063,66.9193,24.1118,...,0.8968,0.0000,0.1109,0.0000,0.5660,0.0000,0.0,9.3709,0.0335,1.8239
2,b8d729a4-658d-49c7-b6c1-3490041ad3e1,53.2076,0.1048,54.1207,5.8853,4.1955,5.0059,2.5866,11.0124,31.2974,...,0.4690,0.1319,0.1063,0.0000,0.6104,0.0000,0.0,26.3106,0.0402,1.7946
3,deacae6a-5f74-4a07-8ee7-5f378fa5ea41,57.5719,0.0000,86.4358,4.7965,3.3714,3.2967,3.0484,9.9543,12.3831,...,1.2689,0.0000,0.1992,0.0000,0.2795,0.0000,0.0,19.4321,0.0151,6.4643
4,36df1baa-1477-4192-a6f1-51585f29f6b2,26.5473,0.0861,61.5650,5.0289,2.1703,7.7559,9.7178,4.3662,15.7490,...,0.6208,0.0000,0.3494,0.0000,0.4569,0.0000,0.0,17.3981,0.0462,7.1829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,edf2bc5d-3b50-411a-819e-aac3aa1207aa,67.4699,1.6255,78.4940,7.7254,4.3927,13.4459,8.2017,29.0845,30.7405,...,0.0752,0.0000,0.2302,0.0000,0.5189,0.0000,0.0,18.2797,0.1102,1.2953
703,ee33c2fd-140b-4b4c-8820-f711facc6b4c,78.6831,1.8529,60.1268,6.2156,5.1865,8.4333,21.7751,45.8958,15.1308,...,0.3950,0.0000,0.0000,0.0000,0.3599,0.0000,0.0,16.4865,0.0812,1.3372
704,814e81ff-bb80-4c3c-bd94-d86c89de9a36,9.4880,0.0000,36.2045,2.9239,3.0229,1.2153,49.4383,33.9834,7.3017,...,0.0796,0.0000,0.0000,3.1495,0.1907,7.7446,0.0,2.3007,0.0540,1.2654
705,200bfb25-663b-4216-be51-bafaea7c5d4a,37.7357,0.0758,245.2148,6.3458,12.5770,0.0000,0.0000,68.7707,9.0761,...,0.1131,0.0000,0.2052,4.9733,0.7199,6.7262,0.0,8.9586,0.0349,0.7993


## Read clinical and sample data

In [8]:
# Load clinical data 
clinical_data = pd.read_csv("./metadata/clinical.tsv", sep='\t')
clinical_data.head()

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,...,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type
0,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,no,'--,Immunotherapy (Including Vaccines)
1,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,no,'--,Targeted Molecular Therapy
2,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,no,'--,"Radiation Therapy, NOS"
3,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,yes,'--,Chemotherapy
4,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Neoadjuvant,no,'--,'--


In [9]:
clinical_data.shape

(1386, 158)

In [10]:
clinical_data.groupby("primary_diagnosis")['case_id'].count()

primary_diagnosis
Astrocytoma, NOS                 118
Astrocytoma, anaplastic          240
Glioblastoma                     434
Gliosarcoma                        6
Mixed glioma                     242
Oligodendroglioma, NOS           202
Oligodendroglioma, anaplastic    144
Name: case_id, dtype: int64

In [11]:
unique_cases_in_clinical = clinical_data['case_submitter_id'].nunique()
case_duplicates = clinical_data['case_submitter_id'].value_counts()
# Display the unique case count and the distribution of duplicates
unique_cases_in_clinical, case_duplicates.head()

(663,
 case_submitter_id
 HCM-BROD-0213-C71    8
 HCM-BROD-0104-C71    7
 HCM-BROD-0106-C71    7
 HCM-BROD-0420-C71    7
 HCM-BROD-0198-C71    7
 Name: count, dtype: int64)

In [12]:
# drop duplicates based on 'case_submitter_id'
clinical_data_dedup = clinical_data.drop_duplicates(subset='case_submitter_id', keep='first')
clinical_data_dedup.shape

(663, 158)

## Simplify labels to broader categories

In [13]:
# simplify labels by mapping complex or specific diagnoses to broader categories - dropping 'mixed gliomas'
diagnosis_map = {
    "Astrocytoma, NOS": "Astrocytoma",
    "Astrocytoma, anaplastic": "Astrocytoma",
    "Oligodendroglioma, NOS": "Oligodendroglioma",
    "Oligodendroglioma, anaplastic": "Oligodendroglioma",
    "Glioblastoma": "Glioblastoma"
}

# create a new 'label' column based on the mapping
clinical_data_dedup.loc[:, 'label'] = clinical_data_dedup['primary_diagnosis'].map(diagnosis_map)
clinical_data_dedup.groupby(["primary_diagnosis", "label"])['case_submitter_id'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data_dedup.loc[:, 'label'] = clinical_data_dedup['primary_diagnosis'].map(diagnosis_map)


primary_diagnosis              label            
Astrocytoma, NOS               Astrocytoma           59
Astrocytoma, anaplastic        Astrocytoma          120
Glioblastoma                   Glioblastoma         189
Oligodendroglioma, NOS         Oligodendroglioma    101
Oligodendroglioma, anaplastic  Oligodendroglioma     72
Name: case_submitter_id, dtype: int64

In [14]:
sample_data = pd.read_csv("./metadata/gdc_sample_sheet.tsv", sep='\t')
# Simplify filenames in sample_data for merging
sample_data['filename_short'] = sample_data['File Name'].apply(lambda x: x.split('.')[0])
sample_data = pd.merge(sample_data, combined_df, left_on='filename_short', right_on='file_name', how='inner')

# Retain relevant columns in sample_data
columns_to_retain = ['File ID', 'File Name', 'Data Category', 'Data Type', 'Project ID', 'Case ID', 'Sample ID', 'Sample Type', 'filename_short']
sample_data = sample_data[columns_to_retain]
sample_data.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short
0,92cdbe72-8af4-4e98-87be-9c437978eb0c,2fb43643-4dc3-41b3-a299-815f5c8b0d72.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-03748,C3L-03748-04,Primary Tumor,2fb43643-4dc3-41b3-a299-815f5c8b0d72
1,99e99371-68ea-49e0-adea-cd59a61e4539,502a976d-ce43-4b4a-b813-58eaa2b0c387.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0106-C71,HCM-BROD-0106-C71-02A,Recurrent Tumor,502a976d-ce43-4b4a-b813-58eaa2b0c387
2,a3f23052-28f1-44b9-b2e5-54ca544f16ed,8f11a23c-f4da-462f-95e9-4c85e56f603a.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0106-C71,HCM-BROD-0106-C71-85A,Next Generation Cancer Model,8f11a23c-f4da-462f-95e9-4c85e56f603a
3,dee27d65-9700-4d94-bd5d-8d67a34747a0,8d272434-7ba5-4287-9f1f-f4fac4e1fc7b.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0103-C71,HCM-BROD-0103-C71-85B,Next Generation Cancer Model,8d272434-7ba5-4287-9f1f-f4fac4e1fc7b
4,3f5c7c90-3c15-46fd-8d33-a8cff36325c4,014c0152-04c3-4370-9fc6-e42bdc7f3f79.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0210-C71,HCM-BROD-0210-C71-02A,Recurrent Tumor,014c0152-04c3-4370-9fc6-e42bdc7f3f79


In [15]:
sample_data.shape

(707, 9)

In [16]:
# Merge with the clinical data to get the primary diagnosis and label
merged_data_dedup = sample_data.merge(
    clinical_data_dedup[['case_submitter_id', 'primary_diagnosis', 'label']], 
    how='left', 
    left_on='Case ID', 
    right_on='case_submitter_id'
)

merged_data_dedup

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,case_submitter_id,primary_diagnosis,label
0,92cdbe72-8af4-4e98-87be-9c437978eb0c,2fb43643-4dc3-41b3-a299-815f5c8b0d72.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-03748,C3L-03748-04,Primary Tumor,2fb43643-4dc3-41b3-a299-815f5c8b0d72,C3L-03748,Glioblastoma,Glioblastoma
1,99e99371-68ea-49e0-adea-cd59a61e4539,502a976d-ce43-4b4a-b813-58eaa2b0c387.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0106-C71,HCM-BROD-0106-C71-02A,Recurrent Tumor,502a976d-ce43-4b4a-b813-58eaa2b0c387,HCM-BROD-0106-C71,Glioblastoma,Glioblastoma
2,a3f23052-28f1-44b9-b2e5-54ca544f16ed,8f11a23c-f4da-462f-95e9-4c85e56f603a.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0106-C71,HCM-BROD-0106-C71-85A,Next Generation Cancer Model,8f11a23c-f4da-462f-95e9-4c85e56f603a,HCM-BROD-0106-C71,Glioblastoma,Glioblastoma
3,dee27d65-9700-4d94-bd5d-8d67a34747a0,8d272434-7ba5-4287-9f1f-f4fac4e1fc7b.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0103-C71,HCM-BROD-0103-C71-85B,Next Generation Cancer Model,8d272434-7ba5-4287-9f1f-f4fac4e1fc7b,HCM-BROD-0103-C71,Glioblastoma,Glioblastoma
4,3f5c7c90-3c15-46fd-8d33-a8cff36325c4,014c0152-04c3-4370-9fc6-e42bdc7f3f79.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0210-C71,HCM-BROD-0210-C71-02A,Recurrent Tumor,014c0152-04c3-4370-9fc6-e42bdc7f3f79,HCM-BROD-0210-C71,Glioblastoma,Glioblastoma
...,...,...,...,...,...,...,...,...,...,...,...,...
702,73d57ab1-1757-4af3-b229-90050dcdb9e0,1fbd55b4-c08b-44b8-940e-6538cc79ef68.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-00104,C3L-00104-01,Primary Tumor,1fbd55b4-c08b-44b8-940e-6538cc79ef68,C3L-00104,Glioblastoma,Glioblastoma
703,0545b5c3-1dfb-4e0e-b956-97ba018a1406,9b4b6b42-09e1-4783-b374-532b9df2e207.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-03681,C3L-03681-03,Primary Tumor,9b4b6b42-09e1-4783-b374-532b9df2e207,C3L-03681,Glioblastoma,Glioblastoma
704,c46154dd-81dd-4203-960c-da74e393eccd,cb6d1cb9-0b4b-4938-902b-39d02b07faf7.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,"C3L-02705, C3L-02705, C3L-02705","C3L-02705-71, C3L-02705-75, C3L-02705-72","Primary Tumor, Primary Tumor, Primary Tumor",cb6d1cb9-0b4b-4938-902b-39d02b07faf7,,,
705,04a95ed6-a66b-4a95-9b12-da3b48ab7ff6,0ab80f6a-8139-42a9-886c-e8eb5cc9de88.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,"C3L-02704, C3L-02704, C3L-02704","C3L-02704-71, C3L-02704-73, C3L-02704-72","Primary Tumor, Primary Tumor, Primary Tumor",0ab80f6a-8139-42a9-886c-e8eb5cc9de88,,,


In [17]:
merged_data_dedup.shape

(707, 12)

In [18]:
# Group by both primary_diagnosis and label, then count unique case_submitter_ids
merged_data_dedup.groupby(['primary_diagnosis', 'label'])['case_submitter_id'].count()

primary_diagnosis              label            
Astrocytoma, NOS               Astrocytoma           61
Astrocytoma, anaplastic        Astrocytoma          121
Glioblastoma                   Glioblastoma         209
Oligodendroglioma, NOS         Oligodendroglioma    108
Oligodendroglioma, anaplastic  Oligodendroglioma     76
Name: case_submitter_id, dtype: int64

In [19]:
# Filter out samples with non-empty labels
merged_data_dedup = merged_data_dedup[merged_data_dedup['label'].notna()]
merged_data_dedup.shape

(575, 12)

## Keep samples that have valid labels

In [20]:
# merge combined_df with merged_data_dedup to get 'Sample ID'
combined_df_with_sample_id = pd.merge(combined_df, merged_data_dedup[['filename_short', 'Sample ID']], 
    left_on='file_name', right_on='filename_short', how='left')

# Set 'Sample ID' as index, drop unnecessary columns
combined_df_with_sample_id = combined_df_with_sample_id.set_index('Sample ID')
combined_df_with_sample_id = combined_df_with_sample_id.drop(['filename_short', 'file_name'], axis=1)

# Filter combined_df_with_sample_id to only include non-empty labels
filtered_sample_ids = merged_data_dedup['Sample ID'].dropna().unique()
combined_df_with_sample_id = combined_df_with_sample_id.loc[filtered_sample_ids]

# Display the final shape
combined_df_with_sample_id.shape

(575, 56940)

## Output the Processed Data

In [22]:
# Define extraction directory
extraction_dir = "./processed"
os.makedirs(extraction_dir, exist_ok=True)

data_file_path = os.path.join(extraction_dir, 'data.csv')
combined_df_with_sample_id.to_csv(data_file_path, index=True)
print(f"Data saved to {data_file_path}")

Data saved to ./processed/data.csv


In [24]:
# Extract and save the filtered labels
y = merged_data_dedup.set_index('Sample ID')['label']
labels_file_path = os.path.join(extraction_dir, 'glioma_labels.csv')
y.to_csv(labels_file_path, index=True)
print(f"Labels saved to {labels_file_path}")

Labels saved to ./processed/glioma_labels.csv
