In [111]:
# Upload the local data file to colab
from google.colab import files
uploaded = files.upload()

In [112]:
# Installation of necessary libraries
!pip install bravado
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install geneview
!pip install gprofiler-official
!pip install statsmodels
!pip install scikit-learn



In [113]:
# import libraries to use
import bravado
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geneview
from gprofiler import GProfiler
import statsmodels.api as sm
from sklearn import datasets, model_selection, metrics
import math


In [114]:
# Read the data in the glioma_mskcc_2019_clinical_data_simpler.csv file into a dataframe
clinical = pd.read_csv('glioma_mskcc_2019_clinical_data_simpler.csv', sep=",")
# Read the data in the data_mutations_short.csv file into a dataframe
mutations = pd.read_csv('data_mutations_short.csv', sep=",")
# Read the data in the Mutated_Genes.csv file into a dataframe
genes = pd.read_csv('Mutated_Genes.csv', sep=",")

In [123]:
# Parsing pathways csv
# Load the CSV file with default separator
pathways = pd.read_csv('pathways.csv')

# Step 1: Split the 'Pathway,Genes' column into 'Pathway' and 'Genes' based on the first comma
pathways[['Pathway', 'Genes']] = pathways['Pathway,Genes'].str.split(',', n=1, expand=True)

# Step 2: Remove the original 'Pathway,Genes' column (not needed anymore)
pathways.drop(columns=['Pathway,Genes'], inplace=True)

# Step 3: Remove quotes and split the 'Genes' into individual gene names
pathways['Genes'] = pathways['Genes'].str.replace('"', '').str.split(', ')

# Step 4: Explode the 'Genes' column so that each gene gets its own row
pathways_exploded = pathways.explode('Genes').reset_index(drop=True)

# Step 5: Clean up any leading/trailing spaces and convert gene names to uppercase
pathways_exploded['Genes'] = pathways_exploded['Genes'].str.strip().str.upper()

# Step 6: Show the cleaned and exploded pathways DataFrame
pathways_exploded

Unnamed: 0,Pathway,Genes
0,Astrocytic drivers,ATRX
1,Astrocytic drivers,TP53
2,Cell-cycle control,CCND2
3,Cell-cycle control,CDK4
4,Cell-cycle control,CDK6
5,Cell-cycle control,CDKN1A
6,Cell-cycle control,CDKN2A/B
7,Cell-cycle control,RB1
8,Chromatin remodeling,ARID1A
9,Chromatin remodeling,ARID1B


In [None]:
# show clinical
clinical

Unnamed: 0,Study ID,Patient ID,Sample ID,Actionable Lesion1,Diagnosis Age,Cancer Type,Cancer Type Detailed,Enhancing,Gene Panel,Histology,...,Overall Survival Status,Patient Display Name,Progress Free Survival (Months),Progression Free Status,Number of Samples Per Patient,Sample Type,Sex,TMB (nonsynonymous),WHO Classification of Diagnostic Tumor,WHO Grade
0,glioma_mskcc_2019,P-0000223,P-0000223-T01-IM3,No,17.0,Glioma,Diffuse Astrocytoma,Yes,IMPACT341,Anaplastic astrocytoma,...,1:DECEASED,glioma_mskcc_2019_552,116.0,1:PROGRESSION,1,Recurrence,Female,5.545777,"Diffuse astrocytoma, IDH-mutant",G3
1,glioma_mskcc_2019,P-0000378,P-0000378-T01-IM3,No,55.0,Glioma,Glioblastoma Multiforme,,IMPACT341,Glioblastoma,...,1:DECEASED,glioma_mskcc_2019_141,,,1,Primary,Male,5.545777,"Glioblastoma, IDH-wildtype",G4
2,glioma_mskcc_2019,P-0000486,P-0000486-T01-IM3,No,35.0,Glioma,Oligodendroglioma,No,IMPACT341,Anaplastic oligodendroglioma,...,0:LIVING,glioma_mskcc_2019_651,113.0,0:CENSORED,1,Recurrence,Female,3.327466,"Oligodendroglioma, IDH-mutant and 1p/19q-codel...",G3
3,glioma_mskcc_2019,P-0000500,P-0000500-T01-IM3,Yes,42.0,Glioma,Glioblastoma Multiforme,,IMPACT341,Glioblastoma,...,1:DECEASED,glioma_mskcc_2019_209,,,1,Recurrence,Male,115.352151,"Glioblastoma, IDH-wildtype",G4
4,glioma_mskcc_2019,P-0000572,P-0000572-T01-IM3,No,59.0,Glioma,Glioblastoma Multiforme,,IMPACT341,Glioblastoma,...,1:DECEASED,glioma_mskcc_2019_111,,,1,Primary,Male,7.764087,"Glioblastoma, IDH-wildtype",G4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846,glioma_mskcc_2019,TRF079053,TRF079053,No,39.0,Glioma,Diffuse Astrocytoma,No,FoundationOneT7,Diffuse astrocytoma,...,0:LIVING,glioma_mskcc_2019_492,22.3,0:CENSORED,1,Primary,Female,6.677785,"Diffuse astrocytoma, IDH-mutant",G2
847,glioma_mskcc_2019,TRF079056,TRF079056,No,28.0,Glioma,Glioblastoma Multiforme,Yes,FoundationOneT7,Glioblastoma,...,0:LIVING,glioma_mskcc_2019_415,12.1,1:PROGRESSION,1,Primary,Male,6.677785,"Glioblastoma, IDH-mutant",G4
848,glioma_mskcc_2019,TRF083668,TRF083668,No,36.0,Glioma,Anaplastic Oligodendroglioma,No,FoundationOneT7,Anaplastic oligodendroglioma,...,0:LIVING,glioma_mskcc_2019_553,83.1,0:CENSORED,1,Primary,Female,2.504169,"Anaplastic oligodendroglioma, IDH-mutant and 1...",G3
849,glioma_mskcc_2019,TRF102171,TRF102171,No,55.0,Glioma,Glioblastoma Multiforme,,FoundationOneT7,Glioblastoma,...,1:DECEASED,glioma_mskcc_2019_250,19.9,1:PROGRESSION,1,Primary,Female,9.181955,"Glioblastoma, IDH-wildtype",G4


In [None]:
# show mutations
mutations

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Consequence,Tumor_Sample_Barcode,HGVSp_Short,Hotspot
0,BRAF,673.0,missense_variant,P-0010578-T01-IM5,p.V600E,0
1,KDM5C,8242.0,missense_variant,P-0010578-T01-IM5,p.P1545L,0
2,NOTCH1,4851.0,stop_gained,P-0010578-T01-IM5,p.Q300*,0
3,PIK3R2,5296.0,splice_acceptor_variant,P-0010578-T01-IM5,p.X272_splice,0
4,PIK3R2,5296.0,missense_variant,P-0010578-T01-IM5,p.E290K,0
...,...,...,...,...,...,...
8900,TERT,7015.0,upstream_gene_variant,P-0003079-T01-IM5,,0
8901,TERT,7015.0,upstream_gene_variant,TRF056398,,0
8902,TERT,7015.0,upstream_gene_variant,P-0002452-T01-IM3,,0
8903,TERT,7015.0,upstream_gene_variant,P-0004609-T01-IM5,,0


In [None]:
# information about dataframes
clinical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851 entries, 0 to 850
Data columns (total 24 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Study ID                                851 non-null    object 
 1   Patient ID                              851 non-null    object 
 2   Sample ID                               851 non-null    object 
 3   Actionable Lesion1                      850 non-null    object 
 4   Diagnosis Age                           850 non-null    float64
 5   Cancer Type                             851 non-null    object 
 6   Cancer Type Detailed                    851 non-null    object 
 7   Enhancing                               343 non-null    object 
 8   Gene Panel                              851 non-null    object 
 9   Histology                               851 non-null    object 
 10  MGMT Status                             572 non-null    object

In [None]:
# information about dataframes
mutations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8905 entries, 0 to 8904
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Hugo_Symbol           8905 non-null   object 
 1   Entrez_Gene_Id        8824 non-null   float64
 2   Consequence           8905 non-null   object 
 3   Tumor_Sample_Barcode  8905 non-null   object 
 4   HGVSp_Short           8308 non-null   object 
 5   Hotspot               8905 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 417.5+ KB


In [100]:
# information about dataframes
genes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485 entries, 0 to 484
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Gene                             485 non-null    object 
 1   MutSig(Q-value)                  0 non-null      float64
 2   # Mut                            485 non-null    int64  
 3   #                                485 non-null    int64  
 4   Profiled Samples                 485 non-null    int64  
 5   Freq                             485 non-null    object 
 6   Is Cancer Gene (source: OncoKB)  485 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 26.6+ KB


In [None]:
#!pip install biopython mygene pandas

#import requests
#import pandas as pd
#from Bio.KEGG import REST
#import mygene

# Initialize MyGene.info for fetching NCBI Gene IDs
mg = mygene.MyGeneInfo()

# Assuming your DataFrame is called 'genes' and the first column contains gene names
gene_list = genes.iloc[:, 0].tolist()  # Extract the first column as a list of genes

# Function to get NCBI Gene ID using MyGene.info based on gene symbols
def get_ncbi_gene_id(gene):
    try:
        result = mg.query(gene, species='human', fields='entrezgene')
        return str(result['hits'][0]['entrezgene']) if 'hits' in result and result['hits'] else None
    except:
        return None  # Return None if no valid ID is found

# Function to query KEGG for pathways related to a gene using NCBI Gene ID
def get_kegg_pathways_for_gene(gene_id):
    try:
        # Query KEGG for the gene's pathways using NCBI Gene ID
        result = REST.kegg_link("pathway", f"hsa:{gene_id}").read()

        # Extract pathway IDs from KEGG result
        pathway_ids = [line.split('\t')[1] for line in result.strip().split('\n')]

        # Get the pathway names by querying KEGG for each pathway ID
        pathway_names = []
        for pathway_id in pathway_ids[:3]:  # Limit to the first 3 pathways
            pathway_info = REST.kegg_get(pathway_id).read()
            # Extract the pathway name from the KEGG entry
            for line in pathway_info.splitlines():
                if line.startswith('NAME'):
                    # Clean pathway name and remove '- Homo sapiens (human)' part
                    pathway_name = line.split('        ')[-1].split(' - ')[0]
                    pathway_names.append(pathway_name)
                    break

        return pathway_names
    except Exception as e:
        return []  # Return an empty list if an error occurs

# Dictionary to store gene-to-pathway mappings
gene_pathway_mapping = {}

# Loop through each gene, get NCBI Gene ID, and fetch pathways from KEGG
for gene in gene_list:
    gene_id = get_ncbi_gene_id(gene)  # Fetch NCBI Gene ID automatically
    if gene_id:  # Only proceed if a valid gene ID is found
        pathways = get_kegg_pathways_for_gene(gene_id)
        gene_pathway_mapping[gene] = pathways

# Convert the gene-pathway mapping into a DataFrame
gene_pathway_df = pd.DataFrame(list(gene_pathway_mapping.items()), columns=['Gene', 'Pathways'])

# Print the results
print(gene_pathway_df)

In [126]:
# Assuming the 'mutations' and 'pathways_exploded' DataFrames are already loaded

# Step 1: Clean and standardize 'Hugo_Symbol' in the 'mutations' DataFrame
mutations['Hugo_Symbol'] = mutations['Hugo_Symbol'].str.strip().str.upper()

# Step 2: Clean and standardize the 'Genes' in the 'pathways_exploded' DataFrame
pathways_exploded['Genes'] = pathways_exploded['Genes'].str.strip().str.upper()

# Step 3: Check if TERT exists in both DataFrames before merging
print("TERT in mutations:", 'TERT' in mutations['Hugo_Symbol'].values)
print("TERT in pathways:", 'TERT' in pathways_exploded['Genes'].values)

# Step 4: Merge the 'mutations' and 'pathways_exploded' DataFrames
mutations_with_pathways = pd.merge(
    mutations,              # The mutations DataFrame
    pathways_exploded,       # The exploded pathways DataFrame
    left_on='Hugo_Symbol',   # Column in mutations that contains gene names
    right_on='Genes',        # Column in pathways that contains gene names
    how='left'               # Use 'left' join to keep all mutations, even if no pathway is found
)

# Step 5: Fill missing pathways with 'none'
mutations_with_pathways['Pathway'].fillna('none', inplace=True)

# Step 6: Drop the 'Genes' column from the resulting DataFrame since it's redundant
mutations_with_pathways.drop(columns=['Genes'], inplace=True)

# Step 7: Show the row where Hugo_Symbol is 'TERT' to verify pathway assignment
print(mutations_with_pathways[mutations_with_pathways['Hugo_Symbol'] == 'TERT'])

TERT in mutations: True
TERT in pathways: True
     Hugo_Symbol  Entrez_Gene_Id            Consequence Tumor_Sample_Barcode  \
518         TERT          7015.0       missense_variant    P-0000944-T01-IM3   
519         TERT          7015.0       missense_variant    P-0000944-T01-IM3   
1255        TERT          7015.0       missense_variant    P-0009634-T01-IM5   
1260        TERT          7015.0       missense_variant    P-0003344-T01-IM5   
1402        TERT          7015.0       missense_variant    P-0000500-T01-IM3   
...          ...             ...                    ...                  ...   
9666        TERT          7015.0  upstream_gene_variant    P-0003079-T01-IM5   
9667        TERT          7015.0  upstream_gene_variant            TRF056398   
9668        TERT          7015.0  upstream_gene_variant    P-0002452-T01-IM3   
9669        TERT          7015.0  upstream_gene_variant    P-0004609-T01-IM5   
9670        TERT          7015.0  upstream_gene_variant    P-0004771-T01-

In [127]:
mutations_with_pathways

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Consequence,Tumor_Sample_Barcode,HGVSp_Short,Hotspot,Pathway
0,BRAF,673.0,missense_variant,P-0010578-T01-IM5,p.V600E,0,RTK-RAS
1,KDM5C,8242.0,missense_variant,P-0010578-T01-IM5,p.P1545L,0,none
2,NOTCH1,4851.0,stop_gained,P-0010578-T01-IM5,p.Q300*,0,NOTCH pathway
3,PIK3R2,5296.0,splice_acceptor_variant,P-0010578-T01-IM5,p.X272_splice,0,none
4,PIK3R2,5296.0,missense_variant,P-0010578-T01-IM5,p.E290K,0,none
...,...,...,...,...,...,...,...
9666,TERT,7015.0,upstream_gene_variant,P-0003079-T01-IM5,,0,Telomere maintenance
9667,TERT,7015.0,upstream_gene_variant,TRF056398,,0,Telomere maintenance
9668,TERT,7015.0,upstream_gene_variant,P-0002452-T01-IM3,,0,Telomere maintenance
9669,TERT,7015.0,upstream_gene_variant,P-0004609-T01-IM5,,0,Telomere maintenance


In [134]:
# Step 1: Clean and standardize the 'Tumor_Sample_Barcode' and 'Sample ID' columns
mutations['Tumor_Sample_Barcode'] = mutations['Tumor_Sample_Barcode'].str.strip().str.upper()
clinical['Sample ID'] = clinical['Sample ID'].str.strip().str.upper()

# Step 2: Check if there are unmatched entries in 'mutations' and 'clinics'
unmatched_samples = mutations[~mutations['Tumor_Sample_Barcode'].isin(clinical['Sample ID'])]
print("Unmatched samples in mutations:", unmatched_samples)



Unmatched samples in mutations:      Hugo_Symbol  Entrez_Gene_Id              Consequence  \
0           BRAF           673.0         missense_variant   
1          KDM5C          8242.0         missense_variant   
2         NOTCH1          4851.0              stop_gained   
3         PIK3R2          5296.0  splice_acceptor_variant   
4         PIK3R2          5296.0         missense_variant   
...          ...             ...                      ...   
8802        TERT          7015.0    upstream_gene_variant   
8803        TERT          7015.0    upstream_gene_variant   
8807        TERT          7015.0    upstream_gene_variant   
8885        TERT          7015.0    upstream_gene_variant   
8886        TERT          7015.0    upstream_gene_variant   

     Tumor_Sample_Barcode    HGVSp_Short  Hotspot  
0       P-0010578-T01-IM5        p.V600E        0  
1       P-0010578-T01-IM5       p.P1545L        0  
2       P-0010578-T01-IM5        p.Q300*        0  
3       P-0010578-T01-IM5  

In [140]:
# Step 1: Clean and standardize the 'Tumor_Sample_Barcode' and 'Sample ID' columns
mutations['Tumor_Sample_Barcode'] = mutations['Tumor_Sample_Barcode'].str.strip().str.upper()
clinical['Sample ID'] = clinical['Sample ID'].str.strip().str.upper()

# Step 2: Identify matched samples
matched_samples = mutations[mutations['Tumor_Sample_Barcode'].isin(clinical['Sample ID'])]
matched_count = len(matched_samples)

# Step 3: Identify unmatched samples
unmatched_samples = mutations[~mutations['Tumor_Sample_Barcode'].isin(clinical['Sample ID'])]
unmatched_count = len(unmatched_samples)

# Step 4: Optional - Identify partially matched samples (trim the sample barcode)
# Here we assume the first 10 characters might help find partial matches (adjust as needed)
mutations['Trimmed_Barcode'] = mutations['Tumor_Sample_Barcode'].str[:10]  # Adjust length based on actual data
partially_matched_samples = mutations[mutations['Trimmed_Barcode'].isin(clinical['Sample ID'])]
partially_matched_count = len(partially_matched_samples)

# Step 5: Print out the results
print(f"Total samples in mutations: {len(mutations)}")
print(f"Matched samples: {matched_count}")
print(f"Unmatched samples: {unmatched_count}")

Total samples in mutations: 8905
Matched samples: 7036
Unmatched samples: 1869


In [141]:
!pip install fuzzywuzzy[speedup]
from fuzzywuzzy import process
import pandas as pd

# Step 1: Clean and standardize the 'Tumor_Sample_Barcode' in mutations and 'Sample ID' in clinical
mutations['Tumor_Sample_Barcode'] = mutations['Tumor_Sample_Barcode'].str.strip().str.upper()
clinical['Sample ID'] = clinical['Sample ID'].str.strip().str.upper()

# Step 2: Identify matched samples
matched_samples = mutations[mutations['Tumor_Sample_Barcode'].isin(clinical['Sample ID'])]
matched_count = len(matched_samples)

# Step 3: Identify unmatched samples
unmatched_samples = mutations[~mutations['Tumor_Sample_Barcode'].isin(clinical['Sample ID'])]
unmatched_count = len(unmatched_samples)

# Step 4: Fuzzy match unmatched samples to 'Sample ID' in clinical
def fuzzy_match(sample_barcode, sample_id_list, threshold=90):
    """ Returns the closest matches with a score above the threshold """
    match = process.extractOne(sample_barcode, sample_id_list, score_cutoff=threshold)
    return match[0] if match else None

# Apply fuzzy matching to the unmatched samples
unmatched_samples['Fuzzy_Match'] = unmatched_samples['Tumor_Sample_Barcode'].apply(
    lambda x: fuzzy_match(x, clinical['Sample ID'].tolist())
)

# Step 5: Count partially matched samples (those that got a fuzzy match)
partially_matched_samples = unmatched_samples[unmatched_samples['Fuzzy_Match'].notna()]
partially_matched_count = len(partially_matched_samples)

# Step 6: Print out the results
print(f"Total samples in mutations: {len(mutations)}")
print(f"Matched samples: {matched_count}")
print(f"Unmatched samples: {unmatched_count}")
print(f"Partially matched samples (fuzzy match): {partially_matched_count}")

# Optionally, save partially matched samples for review

Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.0 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.0->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.0-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Do

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_samples['Fuzzy_Match'] = unmatched_samples['Tumor_Sample_Barcode'].apply(


In [135]:
# Step 3: Merge the cleaned DataFrames
merged_df = pd.merge(
    mutations,             # The mutations DataFrame
    clinical,               # The clinics DataFrame
    left_on='Tumor_Sample_Barcode',  # Column from mutations
    right_on='Sample ID',            # Column from clinics
    how='left'                       # Use 'left' join to keep all rows from mutations
)

# Step 4: Show the merged DataFrame and the unmatched rows
merged_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Consequence,Tumor_Sample_Barcode,HGVSp_Short,Hotspot,Study ID,Patient ID,Sample ID,Actionable Lesion1,...,Overall Survival Status,Patient Display Name,Progress Free Survival (Months),Progression Free Status,Number of Samples Per Patient,Sample Type,Sex,TMB (nonsynonymous),WHO Classification of Diagnostic Tumor,WHO Grade
0,BRAF,673.0,missense_variant,P-0010578-T01-IM5,p.V600E,0,,,,,...,,,,,,,,,,
1,KDM5C,8242.0,missense_variant,P-0010578-T01-IM5,p.P1545L,0,,,,,...,,,,,,,,,,
2,NOTCH1,4851.0,stop_gained,P-0010578-T01-IM5,p.Q300*,0,,,,,...,,,,,,,,,,
3,PIK3R2,5296.0,splice_acceptor_variant,P-0010578-T01-IM5,p.X272_splice,0,,,,,...,,,,,,,,,,
4,PIK3R2,5296.0,missense_variant,P-0010578-T01-IM5,p.E290K,0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8900,TERT,7015.0,upstream_gene_variant,P-0003079-T01-IM5,,0,glioma_mskcc_2019,P-0003079,P-0003079-T01-IM5,No,...,1:DECEASED,glioma_mskcc_2019_94,,,1.0,Primary,Male,1.957439,"Glioblastoma, IDH-wildtype",G4
8901,TERT,7015.0,upstream_gene_variant,TRF056398,,0,glioma_mskcc_2019,TRF056398,TRF056398,No,...,1:DECEASED,glioma_mskcc_2019_95,7.96,1:PROGRESSION,1.0,Primary,Male,5.008339,"Glioblastoma, IDH-wildtype",G4
8902,TERT,7015.0,upstream_gene_variant,P-0002452-T01-IM3,,0,glioma_mskcc_2019,P-0002452,P-0002452-T01-IM3,Yes,...,1:DECEASED,glioma_mskcc_2019_96,6.28,1:PROGRESSION,1.0,Primary,Female,2.218311,"Glioblastoma, IDH-wildtype",G4
8903,TERT,7015.0,upstream_gene_variant,P-0004609-T01-IM5,,0,glioma_mskcc_2019,P-0004609,P-0004609-T01-IM5,Yes,...,1:DECEASED,glioma_mskcc_2019_97,,,1.0,Primary,Male,3.914879,"Glioblastoma, IDH-wildtype",G4


In [142]:

# Optionally drop rows with missing clinical data
merged_df_clean = merged_df.dropna(subset=['Sample ID'])

# Check the number of remaining rows after dropping NAs
print(f"Remaining samples after dropping unmatched rows: {len(merged_df_clean)}")

Remaining samples after dropping unmatched rows: 7036
