In [1]:
# 1) Paths + checks

import numpy as np
import pandas as pd
import h5py
import scipy.sparse as sp
import anndata as ad
from pathlib import Path

# Input locations, resolved relative to the current working directory.
DATA_DIR = Path("../data/raw")
EXPR_PATH = DATA_DIR.joinpath("SKCM_GSE134388_aPD1_expression.h5")
META_PATH = DATA_DIR.joinpath("SKCM_GSE134388_aPD1_CellMetainfo_table.tsv")

# Fail fast, before any loading is attempted, if either input is absent.
assert EXPR_PATH.exists(), f"Expression file not found: {EXPR_PATH}"
assert META_PATH.exists(), f"Metadata file not found: {META_PATH}"

# Report the bare file names only, so the log stays short.
print("Files located successfully")
for tag, located in (("EXPR:", EXPR_PATH), ("META:", META_PATH)):
    print(tag, located.name)


Files located successfully
EXPR: SKCM_GSE134388_aPD1_expression.h5
META: SKCM_GSE134388_aPD1_CellMetainfo_table.tsv


In [None]:
# Load expression from the HDF5 "matrix/" group and build a sparse CSR matrix

import scipy.sparse as sp

def _decode_h5_strings(values):
    """Return string data read from HDF5 as a NumPy array of Python ``str``.

    h5py returns fixed-length strings as ``S``-dtype arrays and
    variable-length strings as object arrays of ``bytes``. A plain
    ``.astype(str)`` on an object array stores the *repr* of each element
    (``"b'ACGT'"``) rather than the decoded text, and on an ``S`` array it
    fails for non-ASCII bytes. Decoding explicitly as UTF-8 avoids both.

    Parameters
    ----------
    values : array-like
        Labels as read from an HDF5 dataset (expected to be 1-D).

    Returns
    -------
    numpy.ndarray
        Unicode (``U`` dtype) array with one entry per input label.
    """
    arr = np.asarray(values)
    if arr.dtype.kind == "S":
        return np.char.decode(arr, "utf-8")
    if arr.dtype.kind == "O":
        return np.array(
            [v.decode("utf-8") if isinstance(v, bytes) else str(v) for v in arr],
            dtype=str,
        )
    return arr.astype(str)


with h5py.File(EXPR_PATH, "r") as f:
    matrix = f["matrix"]
    barcodes = _decode_h5_strings(matrix["barcodes"][:])

    # "features" is either a group holding "name"/"id" datasets or a plain
    # dataset of labels; prefer "name", then "id".
    feats = matrix["features"]
    if isinstance(feats, h5py.Group):
        if "name" in feats:
            genes = _decode_h5_strings(feats["name"][:])
        elif "id" in feats:
            genes = _decode_h5_strings(feats["id"][:])
        else:
            raise KeyError("No gene names found in matrix/features")
    else:
        genes = _decode_h5_strings(feats[:])

    # Pull the three compressed-sparse arrays fully into memory so they stay
    # usable after the file handle is closed.
    data = matrix["data"][:]
    indices = matrix["indices"][:]
    indptr = matrix["indptr"][:]

n_cells = len(barcodes)
n_genes = len(genes)

print("n_cells:", n_cells, "n_genes:", n_genes, "indptr_len:", len(indptr))

# The matrix is interpreted below as CSC with one column per cell, which
# requires exactly n_cells + 1 column pointers. Check this up front so a
# mismatched file fails with a message that names the actual problem.
if len(indptr) != n_cells + 1:
    raise ValueError(
        f"indptr has length {len(indptr)}, expected n_cells + 1 = {n_cells + 1}; "
        "the stored matrix does not have one column per barcode"
    )

# Build as CSC: (genes x cells)
X = sp.csc_matrix((data, indices, indptr), shape=(n_genes, n_cells))

# Convert to AnnData-friendly orientation: (cells x genes)
X = X.T.tocsr()
assert X.shape == (n_cells, n_genes), f"unexpected matrix shape {X.shape}"

print("X final:", X.shape)


n_cells: 3632 n_genes: 14705 indptr_len: 3633
X final: (3632, 14705)


In [3]:
# Load metadata and align to the same cells


import pandas as pd
import numpy as np
import anndata as ad
from pathlib import Path

# Load metadata. The ID column is read as text so that IDs which merely look
# numeric are not reformatted by type inference before they are compared
# with the barcodes.
meta = pd.read_csv(META_PATH, sep="\t", dtype={"Cell": str})

# Use the cell-id column ("Cell" in this file) as the row index.
meta["Cell"] = meta["Cell"].astype(str)
meta = meta.set_index("Cell")

# Make sure barcodes are strings
barcodes = barcodes.astype(str)

# Informational only: number of distinct IDs shared by both sources.
common = np.intersect1d(barcodes, meta.index)
print("common cells:", len(common))

# A duplicated ID in the metadata would make `.loc` return several rows for one
# barcode, so the table would no longer have one row per cell.
if not meta.index.is_unique:
    duplicated_ids = meta.index[meta.index.duplicated()].unique()
    raise ValueError(
        f"{len(duplicated_ids)} cell IDs occur more than once in the metadata, "
        f"e.g. {list(duplicated_ids[:5])}"
    )

# Every barcode must have a metadata row; report how many are missing rather
# than relying on the lookup below to fail.
missing = pd.Index(barcodes).difference(meta.index)
if len(missing) > 0:
    raise KeyError(
        f"{len(missing)} barcodes have no metadata row, e.g. {list(missing[:5])}"
    )

# Reorder (and subset) the metadata so row i describes barcodes[i].
meta = meta.loc[barcodes]
assert len(meta) == len(barcodes), "metadata rows do not match barcode count"
assert (meta.index == barcodes).all(), "metadata order does not match barcodes"


common cells: 3632


In [4]:
# Build AnnData correctly (X is already cells × genes)

# Gene annotation table: no columns, only the gene labels as an index
# named "gene".
gene_index = pd.Index(genes, name="gene")
var = pd.DataFrame(index=gene_index)

# Assemble the container: X is the expression matrix (cells x genes), obs has
# one row per cell, var has one row per gene.
anndata_parts = dict(X=X, obs=meta, var=var)
adata = ad.AnnData(**anndata_parts)

# Show the object summary, then its two dimensions explicitly.
print(adata)
print("obs rows:", adata.n_obs, "var cols:", adata.n_vars)



AnnData object with n_obs × n_vars = 3632 × 14705
    obs: 'UMAP_1', 'UMAP_2', 'Cluster', 'Celltype (malignancy)', 'Celltype (major-lineage)', 'Celltype (minor-lineage)', 'Patient', 'Sample', 'Tissue'
obs rows: 3632 var cols: 14705


In [5]:
# Save to data/processed/

# Destination of the serialized AnnData object.
out = Path("../data/processed/SKCM_GSE134388_aPD1_adata_base.h5ad")

# Create the target directory (and any missing parents) if needed; an
# already-existing directory is not an error.
processed_dir = out.parent
processed_dir.mkdir(parents=True, exist_ok=True)

# Write the file, then echo the path for the notebook log.
adata.write_h5ad(out)
print("Saved:", out)


Saved: ..\data\processed\SKCM_GSE134388_aPD1_adata_base.h5ad
