In [20]:
import h5py
import pandas as pd

# Path to the .gctx file (an HDF5 container opened below with h5py)
gctx_file = "../data/raw/level3_beta_trt_cp_n1805898x12328.gctx"

# Number of leading matrix rows to load for testing. Kept in one place so the
# matrix slice and the label slice below can never drift apart.
n_preview_rows = 10000

with h5py.File(gctx_file, "r") as f:
    # Slice the on-disk dataset so only the leading rows are loaded
    data = f["/0/DATA/0/matrix"]
    subset = data[:n_preview_rows]

    # Load ROW and COL metadata ids
    row_ids = f["/0/META/ROW/id"][:].astype(int)  # ROW ids parsed as integers
    col_ids = f["/0/META/COL/id"][:].astype(str)  # COL ids converted to str
# No explicit f.close() is needed: leaving the `with` block closes the file.

# Build the DataFrame with an implicit transpose: the matrix's first axis is
# labelled with the COL ids and its second axis with the ROW ids.
df = pd.DataFrame(subset, index=col_ids[:n_preview_rows], columns=row_ids)

In [14]:
# Gene annotation table (tab-separated)
genes = pd.read_csv("../data/raw/geneinfo_beta.txt", sep="\t")

# Lookup table keyed by gene_id, giving the matching gene_symbol
gene_mapping = (
    genes
    .set_index("gene_id")
    ["gene_symbol"]
    .to_dict()
)

# Relabel the columns of df through the lookup; labels without an entry in
# the mapping are left as they are.
df = df.rename(gene_mapping, axis="columns")

In [17]:
# Show the dimensions of df as a (rows, columns) tuple
df.shape

(10000, 12328)

In [15]:
# Load the tab-separated compound perturbation metadata and display it
pert_info = pd.read_csv("../data/raw/compound_pert_info.tsv", sep="\t")
pert_info

Unnamed: 0,sample_id,bead_batch,nearest_dose,pert_dose,pert_dose_unit,pert_idose,pert_time,pert_itime,pert_time_unit,cell_mfc_name,...,pert_id,pert_type,cell_iname,qc_pass,dyn_range,inv_level_10,build_name,failure_mode,project_code,cmap_name
0,ABY001_A375_XH_X1_B15:A13,b15,10.00,10.0,uM,10 uM,24.0,24 h,h,A375,...,BRD-K66175015,trt_cp,A375,1.0,11.86880,3347.0,,,ABY,afatinib
1,ABY001_A375_XH_X1_B15:A14,b15,10.00,10.0,uM,10 uM,24.0,24 h,h,A375,...,BRD-K70401845,trt_cp,A375,1.0,14.91800,3185.0,,,ABY,erlotinib
2,ABY001_A375_XH_X1_B15:A15,b15,10.00,10.0,uM,10 uM,24.0,24 h,h,A375,...,BRD-K85606544,trt_cp,A375,1.0,10.83770,2471.0,,,ABY,neratinib
3,ABY001_A375_XH_X1_B15:A16,b15,10.00,10.0,uM,10 uM,24.0,24 h,h,A375,...,BRD-K19687926,trt_cp,A375,1.0,14.30800,3577.0,,,ABY,lapatinib
4,ABY001_A375_XH_X1_B15:A17,b15,10.00,10.0,uM,10 uM,24.0,24 h,h,A375,...,BRD-K66175015,trt_cp,A375,1.0,9.27318,2800.5,,,ABY,afatinib
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312165,TSAI002_NPC-8_XH_X1_B18:J12,b18,10.00,10.0,uM,10 uM,-666.0,,h,NPC-8,...,949,trt_cp,NPC,1.0,17.40000,3741.0,,,TSAI,949
1312166,TSAI002_NPC-8_XH_X1_B18:J14,b18,4.00,5.0,uM,4 uM,-666.0,,h,NPC-8,...,SRT3657,trt_cp,NPC,1.0,15.25750,3555.0,,,TSAI,SRT-3657
1312167,TSAI002_NPC-8_XH_X1_B18:J16,b18,2.22,2.0,uM,2.22 uM,-666.0,,h,NPC-8,...,COMPE,trt_cp,NPC,1.0,12.89100,4318.5,,,TSAI,compe
1312168,TSAI002_NPC-8_XH_X1_B18:J20,b18,10.00,10.0,uM,10 uM,-666.0,,h,NPC-8,...,C646,trt_cp,NPC,1.0,16.06860,3511.0,,,TSAI,C646


In [9]:
# Preview the first rows of df
df.head()

Unnamed: 0,NAT2,ADA,CDH2,AKT3,MED6,NAALAD2,NAALADL1,ACOT8,ABI1,GNPDA1,...,REC8,HNRNPDL,DMTF1,PPP4R1,CDH1,SLC12A6,PTBP3,KCNE2,DGCR2,SCO2
ABY001_A375_XH_X1_B15:A13,5.7191,7.4924,6.56405,9.1052,7.451,4.5935,5.31835,7.7014,6.1691,12.7639,...,5.2306,10.0028,7.18005,7.6505,1.1633,4.8897,5.5534,5.7713,6.1668,9.1689
ABY001_A375_XH_X1_B15:A14,5.3946,7.8669,7.6762,9.2727,8.6112,5.9464,5.37035,7.3337,5.8269,9.9611,...,5.77195,10.3324,6.85075,8.0699,1.8844,4.3627,5.8257,5.3994,5.902,7.61165
ABY001_A375_XH_X1_B15:A15,4.7285,7.0399,4.9337,8.4097,8.7282,5.2363,5.3611,6.88025,5.9476,12.2874,...,5.1743,11.333,7.2998,7.9811,0.9265,5.0362,5.7679,5.583,7.1669,9.1458
ABY001_A375_XH_X1_B15:A16,5.529,7.3355,7.82995,8.7718,8.5942,5.38505,5.2257,7.3696,5.2347,10.512,...,4.59995,11.5465,6.9559,7.7559,2.5469,4.0316,5.636,5.1261,6.4808,8.3281
ABY001_A375_XH_X1_B15:A17,6.611,6.5117,4.89115,9.2922,8.6256,4.6232,5.2491,6.0883,5.67,12.1923,...,5.0566,10.9467,6.4734,7.8914,3.2102,4.4431,4.8528,5.6388,6.8784,10.4999


In [None]:
# from cmapPy.pandasGEXpress.parse import parse

# # Specify the first 1000 row indices
# first_1000_rows = list(range(1000))

# # Parse the file with only the first 1000 rows
# gctoo = parse(
#     "../data/raw/level3_beta_trt_cp_n1805898x12328.gctx",
#     convert_neg_666=True,
#     ridx=first_1000_rows,
# )

In [1]:
import h5py
import pandas as pd


def process_gctx_in_chunks(
    gctx_file,
    valid_sample_ids,
    chunk_size=10000,
    output_file="../data/raw/filtered_data_matrix.tsv",
):
    """
    Processes a .gctx file in chunks, filters rows based on valid sample IDs, and saves the result.

    The data matrix is read ``chunk_size`` rows at a time along its first axis.
    Each chunk is labelled with the matching slice of the COL ids (as the
    index) and with the ROW ids (as the columns), reduced to the rows whose
    label is in ``valid_sample_ids``, and written to ``output_file`` as
    tab-separated text.

    The first chunk is written in truncate mode with a header; later chunks
    are appended without one. Re-running the function therefore replaces an
    existing ``output_file`` instead of appending a second header and
    duplicate rows to it.

    Args:
        gctx_file (str): Path to the .gctx file.
        valid_sample_ids (set): Set of valid sample IDs to filter.
        chunk_size (int): Number of rows to process per chunk. Must be >= 1.
        output_file (str): Path to save the filtered data matrix.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    with h5py.File(gctx_file, "r") as f:
        # Access the data matrix and metadata
        data = f["/0/DATA/0/matrix"]
        col_ids = f["/0/META/COL/id"][:].astype(str)  # Sample IDs
        row_ids = f["/0/META/ROW/id"][:].astype(int)  # Gene IDs

        # True until the first chunk has been written
        first_chunk = True

        # Process in chunks along the matrix's first axis
        for i in range(0, data.shape[0], chunk_size):
            # Load the chunk and the labels for the same positions
            chunk = data[i : i + chunk_size]
            chunk_sample_ids = col_ids[i : i + chunk_size]

            # Create a DataFrame for the chunk
            df_chunk = pd.DataFrame(chunk, index=chunk_sample_ids, columns=row_ids)

            # Keep rows whose label is a valid sample ID. A boolean mask
            # preserves the on-disk row order and any repeated labels.
            df_chunk = df_chunk.loc[df_chunk.index.isin(valid_sample_ids)]

            # Truncate on the first write so stale content from an earlier
            # run is discarded; append (without a header) afterwards.
            df_chunk.to_csv(
                output_file,
                sep="\t",
                mode="w" if first_chunk else "a",
                header=first_chunk,
            )
            first_chunk = False

    print(f"Filtered data matrix saved to {output_file}.")

# Location of the .gctx file
gctx_file = "../data/raw/level3_beta_trt_cp_n1805898x12328.gctx"

# Read the compound perturbation table and gather its sample_id values into
# a set of valid sample IDs
valid_pert_info = pd.read_csv(
    "../data/raw/compound_pert_info.tsv",
    sep="\t",
)
valid_sample_ids = set(valid_pert_info["sample_id"])

# # Process and filter in chunks
# process_gctx_in_chunks(
#     gctx_file=gctx_file,
#     valid_sample_ids=valid_sample_ids,
#     chunk_size=10000,  # Adjust based on available memory
#     output_file="../data/raw/filtered_data_matrix.tsv",
# )

Filtered data matrix saved to ../data/raw/filtered_data_matrix.tsv.


In [4]:
import pandas as pd

# Load only the first 1000 rows of the filtered matrix to check the headers
sample_df = pd.read_csv("../data/raw/filtered_data_matrix.tsv", sep="\t", nrows=1000)

sample_df

Unnamed: 0.1,Unnamed: 0,10,100,1000,10000,10001,10003,10004,10005,10006,...,9985,9987,9988,9989,999,9990,9991,9992,9993,9997
0,ABY001_A375_XH_X1_B15:A13,5.7191,7.4924,6.56405,9.1052,7.451,4.5935,5.31835,7.7014,6.1691,...,5.2306,10.0028,7.18005,7.6505,1.1633,4.8897,5.5534,5.7713,6.1668,9.1689
1,ABY001_A375_XH_X1_B15:A14,5.3946,7.8669,7.6762,9.2727,8.6112,5.9464,5.37035,7.3337,5.8269,...,5.77195,10.3324,6.85075,8.0699,1.8844,4.3627,5.8257,5.3994,5.902,7.61165
2,ABY001_A375_XH_X1_B15:A15,4.7285,7.0399,4.9337,8.4097,8.7282,5.2363,5.3611,6.88025,5.9476,...,5.1743,11.333,7.2998,7.9811,0.9265,5.0362,5.7679,5.583,7.1669,9.1458
3,ABY001_A375_XH_X1_B15:A16,5.529,7.3355,7.82995,8.7718,8.5942,5.38505,5.2257,7.3696,5.2347,...,4.59995,11.5465,6.9559,7.7559,2.5469,4.0316,5.636,5.1261,6.4808,8.3281
4,ABY001_A375_XH_X1_B15:A17,6.611,6.5117,4.89115,9.2922,8.6256,4.6232,5.2491,6.0883,5.67,...,5.0566,10.9467,6.4734,7.8914,3.2102,4.4431,4.8528,5.6388,6.8784,10.4999
