In [None]:
import json
import pandas as pd

# Load the hallmark gene-set collection. The file is parsed as a JSON object
# keyed by gene-set name; each entry is expected to carry a 'geneSymbols' list.
jsonPath = '/content/h.all.v2025.1.Hs.json'  # Adjust path as needed
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open(jsonPath, 'r', encoding='utf-8') as f:
    hallmarkData = json.load(f)

# Hallmark sets of interest.
targetHallmarks = {
    "HALLMARK_E2F_TARGETS",
    "HALLMARK_G2M_CHECKPOINT",
    "HALLMARK_MYC_TARGETS_V1",
    "HALLMARK_MYC_TARGETS_V2",
    "HALLMARK_MTORC1_SIGNALING"
}

# Build the union of all target hallmark genes and report the size of each set
# in a single pass. Iterating in sorted order keeps the printed report stable
# across kernel restarts (plain set iteration order of strings is not).
hallmarkGenes = set()
for hallmarkName in sorted(targetHallmarks):
    details = hallmarkData.get(hallmarkName)
    if details is None:
        # Stay best-effort, but make a typo or a collection-version mismatch
        # visible instead of silently contributing zero genes.
        print(f"WARNING: {hallmarkName} not found in {jsonPath}")
        details = {}
    genes = details.get('geneSymbols', [])
    # Strip any leading single-quote characters from each symbol.
    cleanedGenes = [gene.lstrip("'") for gene in genes]
    hallmarkGenes.update(cleanedGenes)
    print(f"{hallmarkName}: {len(genes)} genes")

# Read the "Multivariate - CNA" workbook into a DataFrame.
multivariateCnaPath='/content/Multivariate - CNA.xlsx'   # Adjust path as needed
multiCnaDf = pd.read_excel(multivariateCnaPath, engine='openpyxl')

# Normalise the 'Gene' column: cast every value to str, then drop any leading
# single-quote characters so the symbols can be compared against hallmarkGenes.
geneSymbols = multiCnaDf['Gene'].astype(str)
multiCnaDf['Gene'] = geneSymbols.str.lstrip("'")

# Keep only the rows whose gene symbol belongs to the hallmark gene union.
isHallmarkGene = multiCnaDf['Gene'].isin(hallmarkGenes)
filteredMultiCnaDf = multiCnaDf.loc[isHallmarkGene]

# Report how many rows the filter kept.
rowsBefore = multiCnaDf.shape[0]
rowsAfter = filteredMultiCnaDf.shape[0]
print(f"Rows before filtering: {rowsBefore}")
print(f"Rows after filtering: {rowsAfter}")

# Distinct gene symbols remaining after the filter.
uniqueGenesLeft = filteredMultiCnaDf['Gene'].nunique()
print(f"Number of unique genes left after filtering: {uniqueGenesLeft}")




HALLMARK_G2M_CHECKPOINT: 200 genes
HALLMARK_MYC_TARGETS_V1: 200 genes
HALLMARK_MYC_TARGETS_V2: 58 genes
HALLMARK_E2F_TARGETS: 200 genes
HALLMARK_MTORC1_SIGNALING: 200 genes
Rows before filtering: 57185
Rows after filtering: 655
Number of unique genes left after filtering: 655


In [None]:
# Write the hallmark-filtered CNA table to CSV, omitting the DataFrame index.
outputPath = '/content/Multivariate_CNA_filtered_by_hallmark.csv'
filteredMultiCnaDf.to_csv(path_or_buf=outputPath, index=False)

# Confirm where the file was written.
print(f"Filtered data saved to: {outputPath}")

Filtered data saved to: /content/Multivariate_CNA_filtered_by_hallmark.csv


In [None]:
# Colab-only helper: this import is only available inside a Google Colab
# runtime, so this cell is expected to fail elsewhere.
from google.colab import files
# Hand the CSV at outputPath (set in the save cell) to Colab's download helper.
files.download(outputPath)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>