In [10]:
import scanpy as sc
import anndata
import glob
import gc
import os
from scipy import sparse
import pandas as pd
import time

In [5]:

def is_valid_h5ad(filename):
    """Check if the file contains the expected 'obs' group."""
    try:
        with h5py.File(filename, 'r') as f:
            return "obs" in f.keys()
    except Exception as e:
        print(f"Error opening {filename}: {e}")
        return False

# List all h5ad files in the current folder
files = glob.glob("*.h5ad")

print(f"Found {len(files)} h5ad files.")

Found 7 h5ad files.


In [11]:
# files= files[1:]
files = ['GSE123025_GSM_preprocessed_with_metadata.h5ad',
 'GSE121654_GSM_preprocessed_with_metadata.h5ad',
 'GSE98969_GSM_preprocessed_with_metadata.h5ad',
         'GSE98971_GSM_preprocessed_with_metadata.h5ad']

In [12]:
def get_union_vars(files):
    """Compute the union of all variable names across files without loading full data."""
    union_vars = set()
    for f in files:
        try:
            # Open in backed mode to only read metadata
            adata = sc.read_h5ad(f, backed='r')
            print(adata.var_names)
            union_vars.update(adata.var_names)
            
            adata.file.close()  # Close the file handle
        except Exception as e:
            print(f"Error processing {f} for union vars: {e}")
    return list(union_vars)

def reindex_adata(adata, union_vars):
    """
    Reindex the AnnData object so that its columns match the union of all variables.
    For missing variables, add columns filled with zeros.
    """
    current_vars = adata.var_names.tolist()
    missing = set(union_vars) - set(current_vars)
    if missing:
        n_missing = len(missing)
        # Create a sparse matrix of zeros for the missing variables.
        zeros = sparse.csr_matrix((adata.n_obs, n_missing))
        # Create an AnnData object for missing columns.
        missing_adata = anndata.AnnData(X=zeros, obs=adata.obs.copy(), 
                                        var=pd.DataFrame(index=list(missing)))
        # Concatenate along columns (axis=1) with fill_value=0.
        adata = anndata.concat([adata, missing_adata], axis=1, join='outer', fill_value=0)
    # Now sort columns so that they follow the union_vars order.
    adata = adata[:, union_vars].copy()
    return adata


In [13]:
# Compute the union of variables from all files.
union_vars = get_union_vars(files)
print(f"Union of variables has {len(union_vars)} elements.")

Index(['0610005C13Rik', '0610007C21Rik', '0610007L01Rik', '0610007N19Rik',
       '0610007P08Rik', '0610007P14Rik', '0610007P22Rik', '0610008F07Rik',
       '0610009B14Rik', '0610009B22Rik',
       ...
       'Zxda', 'Zxdb', 'Zxdc', 'Zyg11a', 'Zyg11b', 'Zyx', 'Zzef1', 'Zzz3', 'a',
       'l7Rn6'],
      dtype='object', length=23429)
Index(['X0610007P14Rik', 'X0610009B22Rik', 'X0610009L18Rik', 'X0610009O20Rik',
       'X0610010F05Rik', 'X0610010K14Rik', 'X0610011F06Rik', 'X0610012G03Rik',
       'X0610025J13Rik', 'X0610030E20Rik',
       ...
       'Gm15882', 'Gm26671', 'Gm27196', 'Gm3289', 'Gm42485', 'Gm9857',
       'Olfr1143', 'Pkhd1l1', 'Slc22a29', 'Tex19.2'],
      dtype='object', name='GENE', length=20758)
Index(['0610005C13Rik', '0610007C21Rik', '0610007L01Rik', '0610007P08Rik',
       '0610007P14Rik', '0610007P22Rik', '0610008F07Rik', '0610009B14Rik',
       '0610009B22Rik', '0610009D07Rik',
       ...
       'snoU83B', 'snoU85', 'snoU89', 'snoU90', 'snoU97', 'snoZ159', 'snoZ178

In [6]:
# Iteratively concatenate files into one combined AnnData.
combined_adata = None
for f in files:
    try:
        print(f"Processing file: {f}")
        # Open in backed mode and then load into memory.
        adata = sc.read_h5ad(f, backed='r').to_memory()
        # Ensure observation names are unique.
        adata.obs_names_make_unique()
        # Reindex so that this AnnData has the full set of variables.
        adata = reindex_adata(adata, union_vars)
        
        # If this is the first file, initialize the combined object.
    #     if combined_adata is None:
    #         combined_adata = adata
    #     else:
    #         # Concatenate the new AnnData with the existing combined AnnData.
    #         combined_adata = anndata.concat([combined_adata, adata],
    #                                         join='outer', fill_value=0)
    #     # Clean up to free memory.
    #     del adata
    #     gc.collect()
    except Exception as e:
        print(f"Error processing {f}: {e}")

Processing file: GSM3442052_P5_female_percoll_3.dge.txt_GSM_preprocessed_with_metadata.h5ad


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Processing file: GSM1940063_Agilent_252665515669_S001_GE2_1105_Oct12_1_2.txt_GSM_preprocessed_with_metadata.h5ad
Error processing GSM1940063_Agilent_252665515669_S001_GE2_1105_Oct12_1_2.txt_GSM_preprocessed_with_metadata.h5ad: "Unable to open object (object 'obs' doesn't exist)"
Processing file: GSM2629424_AB2340.txt.gz_GSM_preprocessed_with_metadata.h5ad
Processing file: GSE121654_GSM_preprocessed_with_metadata.h5ad


  utils.warn_names_duplicates("obs")


Processing file: GSM1940080_GSM_preprocessed_with_metadata.h5ad
Error processing GSM1940080_GSM_preprocessed_with_metadata.h5ad: "Unable to open object (object 'obs' doesn't exist)"
Processing file: GSE98971_GSM_preprocessed_with_metadata.h5ad


  utils.warn_names_duplicates("obs")


Processing file: GSM3442011_GSM_preprocessed_with_metadata.h5ad


  utils.warn_names_duplicates("obs")


Processing file: GSM2629424_GSM_preprocessed_with_metadata.h5ad


  utils.warn_names_duplicates("obs")


In [7]:
if combined_adata is None:
    raise ValueError("No valid h5ad files were processed.")


In [5]:
print("Final combined shape:", combined_adata.shape)
# Save the final combined AnnData to disk.
combined_adata.write("combined_data.h5ad")

NameError: name 'combined_adata' is not defined

In [14]:
def get_common_obs_columns(files):
    """Compute the intersection of obs columns across all files without loading full data."""
    common_obs = None
    for f in files:
        try:
            # Open in backed mode to quickly access metadata
            adata = sc.read_h5ad(f, backed='r')
            # Get the obs columns from this file
            obs_cols = set(adata.obs.columns)
            if common_obs is None:
                common_obs = obs_cols
            else:
                common_obs = common_obs.intersection(obs_cols)
            adata.file.close()  # Close file handle
        except Exception as e:
            print(f"Error processing {f} for obs columns: {e}")
    return list(common_obs) if common_obs is not None else []

In [15]:
import numpy as np

def reindex_obs(adata, common_obs_cols):
    """
    Reindex the AnnData object's .obs DataFrame so that it contains exactly the columns in common_obs_cols.
    For any missing column, create that column filled with NaN values.
    """
    # Add missing columns with NaN values.
    for col in common_obs_cols:
        if col not in adata.obs.columns:
            adata.obs[col] = np.nan

    # Subset and reorder the obs DataFrame.
    adata.obs = adata.obs[common_obs_cols].copy()
    
    return adata


In [16]:
# Compute the common obs columns across files.
common_obs_cols = get_common_obs_columns(files)
print("Common obs columns:", common_obs_cols)

Common obs columns: ['DATASTORE region', 'BioProject', 'Consent', 'Bytes', 'DATASTORE filetype', 'Center Name', 'AvgSpotLen', 'ReleaseDate', 'Bases', 'Experiment', 'BioSample', 'DATASTORE provider', 'Organism', 'GEO_Accession (exp)', 'LibrarySource', 'strain', 'Assay Type', 'create_date', 'source_name', 'Instrument', 'version', 'LibraryLayout', 'LibrarySelection', 'Platform', 'SRA Study']


In [38]:
important_cols = [
    'Bytes',
    'Bases',
    'strain',
    'LibrarySelection',
    'LibrarySource',
    'LibraryLayout',
    'Instrument',
    'source_name',
    'Assay Type',
    'AvgSpotLen'
]
def keep_important_obs_cols(adata, important_cols):
    """
    Subset the AnnData object's .obs to only the important columns.
    Missing columns will be ignored.
    """
    # Find which columns in your list actually exist in the data
    existing_cols = [col for col in important_cols if col in adata.obs.columns]

    # For columns that don't exist, you may want to add them as NaN
    missing_cols = [col for col in important_cols if col not in adata.obs.columns]
    for col in missing_cols:
        adata.obs[col] = np.nan

    # Now subset and reorder .obs to the exact order in important_cols
    adata.obs = adata.obs[important_cols].copy()
    return adata


In [39]:
# Process files in small chunks and write intermediate combined AnnData files to disk.
chunk_size = 1  # adjust as needed for your memory capacity
intermediate_dir = "intermediate_chunks"
os.makedirs(intermediate_dir, exist_ok=True)
intermediate_files = []

In [40]:
for i in range(0, len(files), chunk_size):
    combined_chunk = None
    chunk_files = files[i:i+chunk_size]
    print(f"Processing chunk {i // chunk_size + 1}: {chunk_files}")
    for f in chunk_files:
        try:
            # Open in backed mode and load into memory.
            adata = sc.read_h5ad(f, backed='r').to_memory()
            adata.obs_names_make_unique()
            # Reindex the .obs to have only the common obs columns.
            # adata = reindex_obs(adata, common_obs_cols)
            adata = keep_important_obs_cols(adata, important_cols)
            # If this is the first file in the chunk, initialize; else, concatenate.
            if combined_chunk is None:
                combined_chunk = adata
            else:
                combined_chunk = anndata.concat([combined_chunk, adata],
                                                join='outer', fill_value=0)
            del adata
            gc.collect()
            time.sleep(2)  # short delay to alleviate I/O pressure
        except Exception as e:
            print(f"Error processing {f}: {e}")
    if combined_chunk is not None:
        chunk_file = os.path.join(intermediate_dir, f"chunk_{i // chunk_size}.h5ad")
        combined_chunk.write(chunk_file)
        intermediate_files.append(chunk_file)
        print(f"Written intermediate chunk to {chunk_file}")
        del combined_chunk
        gc.collect()

Processing chunk 1: ['intermediate_chunks/chunk_3.h5ad']
Written intermediate chunk to intermediate_chunks/chunk_0.h5ad
Processing chunk 2: ['intermediate_chunks/chunk_2.h5ad']
Written intermediate chunk to intermediate_chunks/chunk_1.h5ad


In [42]:
files = glob.glob("intermediate_chunks/chunk_*.h5ad")
for f in files:
    a1 = sc.read(f)
    print(a1)

AnnData object with n_obs × n_vars = 37248 × 34016
    obs: 'Bytes', 'Bases', 'strain', 'LibrarySelection', 'LibrarySource', 'LibraryLayout', 'Instrument', 'source_name', 'Assay Type', 'AvgSpotLen'
AnnData object with n_obs × n_vars = 37248 × 34016
    obs: 'Bytes', 'Bases', 'strain', 'LibrarySelection', 'LibrarySource', 'LibraryLayout', 'Instrument', 'source_name', 'Assay Type', 'AvgSpotLen'
AnnData object with n_obs × n_vars = 37248 × 34016
    obs: 'Bytes', 'Bases', 'strain', 'LibrarySelection', 'LibrarySource', 'LibraryLayout', 'Instrument', 'source_name', 'Assay Type', 'AvgSpotLen'
AnnData object with n_obs × n_vars = 37248 × 34016
    obs: 'Bytes', 'Bases', 'strain', 'LibrarySelection', 'LibrarySource', 'LibraryLayout', 'Instrument', 'source_name', 'Assay Type', 'AvgSpotLen'


In [41]:
import scanpy as sc
import pandas as pd
import anndata
import zarr
import glob
import gc

In [24]:
files = glob.glob("intermediate_chunks/chunk_*.h5ad")

zarr_paths = []
for fn in files:
    # Read the dataset in backed mode to avoid loading the entire file into memory
    ad = sc.read_h5ad(fn, backed="r")
    zpath = fn.replace(".h5ad", ".zarr")
    
    # Write to Zarr format with specified chunk sizes (tweak chunk sizes as needed)
    ad.write_zarr(zpath, chunks=(500, 500))
    
    # Force the AnnData object to load fully into memory, then release it
    ad._init_as_actual()  # Call without any parameters
    del ad
    gc.collect()
    
    zarr_paths.append(zpath)

In [26]:
import zarr, pandas as pd, numpy as np, glob, os, gc
from collections import OrderedDict
# 1) Locate your per‑chunk Zarr stores
zarr_paths = sorted(glob.glob("intermediate_chunks/chunk_*.zarr"))

# 2) Build union of var names
all_vars = []
for zp in zarr_paths:
    grp = zarr.open_group(zp, mode="r")
    # List the single key under var
    keys = list(grp["var"].array_keys())
    if len(keys) != 1:
        raise ValueError(f"Expected exactly one var index in {zp}, found {keys}")
    idx_key = keys[0]
    raw = grp["var"][idx_key][:]
    # Decode bytes to strings if necessary
    vars_chunk = [v.decode("utf-8") if isinstance(v, bytes) else v for v in raw]
    all_vars.extend(vars_chunk)

# Build union preserving first‐seen order
union_vars = list(OrderedDict.fromkeys(all_vars))
var2idx = {v: i for i, v in enumerate(union_vars)}
total_vars = len(union_vars)

# Step 3: Gather metadata including each chunk's gene list
metas = []
for zp in zarr_paths:
    grp = zarr.open_group(zp, mode="r")
    X = grp["X"]
    var_keys = list(grp["var"].array_keys())
    idx_key = var_keys[0]
    raw = grp["var"][idx_key][:]
    vars_chunk = [v.decode("utf-8") if isinstance(v, bytes) else v for v in raw]

    metas.append({
        "path": zp,
        "n_obs": X.shape[0],
        "chunks": X.chunks,       # (row_chunk, col_chunk)
        "dtype": X.dtype,
        "compressor": X.compressor,
        "vars": vars_chunk,
    })

total_obs = sum(m["n_obs"] for m in metas)

# Step 4: Create merged Zarr store for X
out_path = "merged_dataset1.zarr"
if os.path.exists(out_path):
    os.remove(out_path)
out = zarr.open_group(out_path, mode="w")
out.create_dataset(
    "X",
    shape=(total_obs, total_vars),
    chunks=(metas[0]["chunks"][0], min(total_vars, 1000)),
    dtype=metas[0]["dtype"],
    compressor=metas[0]["compressor"],
)

# Step 5: Stream‐copy each chunk’s X into the global Zarr, aligning columns
obs_dfs = []
offset = 0

for m in metas:
    grp = zarr.open_group(m["path"], mode="r")
    X = grp["X"]
    cols_idx = [var2idx[v] for v in m["vars"]]

    # collect this chunk's obs
    obs_cols = {c: grp["obs"][c][:] for c in grp["obs"].array_keys()}
    obs_dfs.append(pd.DataFrame(obs_cols))

    # copy X in row‐chunks, slicing columns
    row_chunk = m["chunks"][0]
    for start in range(0, m["n_obs"], row_chunk):
        stop = min(start + row_chunk, m["n_obs"])
        block = X[start:stop, :]  # shape (r, original_vars)
    
        # Assign each local column j to the global column cols_idx[j]
        for j, gcol in enumerate(cols_idx):
            out["X"][offset + start : offset + stop, gcol] = block[:, j]
    offset += m["n_obs"]
    gc.collect()

# Step 6: Write obs and var out as CSVs (outside the Zarr store)
obs_df = pd.concat(obs_dfs, axis=0, ignore_index=True)
obs_df.to_csv("merged_dataset.zarr/merged_dataset.obs.csv", index=False)

pd.DataFrame(index=union_vars).to_csv("merged_dataset.zarr/merged_dataset.var.csv")

print("✅ Done!")
print("  • merged X at: merged_dataset.zarr")
print("  • obs table at: merged_dataset.obs.csv")
print("  • var list at: merged_dataset.var.csv")


✅ Done!
  • merged X at: merged_dataset.zarr
  • obs table at: merged_dataset.obs.csv
  • var list at: merged_dataset.var.csv


In [27]:
import anndata
import zarr
import pandas as pd

# 1) Load obs and var
obs = pd.read_csv("merged_dataset.zarr/merged_dataset.obs.csv", index_col=None)
var = pd.read_csv("merged_dataset.zarr/merged_dataset.var.csv", index_col=0)

# 2) Load X fully into memory (this will allocate a dense array!)
X = zarr.open("merged_dataset1.zarr", mode="r")["X"][:]

# 3) Construct AnnData
adata = anndata.AnnData(X=X, obs=obs, var=var)
print(adata)


AnnData object with n_obs × n_vars = 74496 × 34016
    obs: 'AvgSpotLen', 'Bases', 'Bytes', '_index'




In [28]:
if '_index' in adata.obs.columns:
    adata.obs.rename(columns={'_index': 'original_index'}, inplace=True)


In [29]:
# 4) (Optional) write to h5ad for future use
adata.write_h5ad("merged_dataset1.h5ad")
