Here, we find the intersection of cells between the current neuron subset and the neuron subset deposited on GEO (for revisions), update the neuronal count matrices with SoupX-corrected counts, and filter out SOLO-detected doublets. This yields a neuron-specific dataset for downstream training, DE testing, and pathway enrichment.

In [None]:
import os
import sys
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [None]:
# Directory holding the GEO submission files (accession GSE281850).
data_dir = '/path/to/data/geo'

# Expression matrix and per-cell metadata, both stored as TSV files.
matrix_file = os.path.join(data_dir, 'GSE281850_expression_matrix.tsv')
metadata_file = os.path.join(data_dir, 'GSE281850_cell_metadata.tsv')

In [None]:
# Read both tab-separated tables with identical options; the first column of
# each file becomes the row index. The matrix is read first, then the metadata.
expression_df, metadata_df = (
    pd.read_csv(tsv_path, sep='\t', index_col=0)
    for tsv_path in (matrix_file, metadata_file)
)

In [None]:
# Wrap the GEO expression matrix in an AnnData object: the rows of
# expression_df become observations and its columns become variables.
adata = sc.AnnData(X=expression_df.values)
# NOTE(review): the metadata table is attached as-is and obs_names are then
# overwritten with expression_df.index below, so this presumably relies on
# metadata_df rows being in the same order as expression_df rows — confirm.
adata.obs = metadata_df
adata.var_names = expression_df.columns
adata.obs_names = expression_df.index

In [None]:
# Display the per-observation metadata attached to the GEO AnnData object.
adata.obs

In [None]:
# Load a previously saved AnnData object from disk and display its summary.
# NOTE(review): presumably this is the SoupX-corrected, all-cell dataset that
# the notebook introduction refers to — confirm against the upstream notebook.
adata_clean_path = '/path/to/data/h5ad/01_adata_all.h5ad'
adata_clean = sc.read_h5ad(adata_clean_path)
adata_clean

In [None]:
# Display the per-observation annotations of the loaded dataset.
adata_clean.obs

# Mapping

In [None]:
def transfer_annotations(adata, adata_clean, suffix_map, columns_to_transfer):
    """Copy per-cell annotation columns from ``adata`` onto ``adata_clean``.

    Cells are paired by barcode. Every barcode is split at its last ``-`` into
    a core part and a trailing suffix. A cell of ``adata`` is paired with a
    cell of ``adata_clean`` when both core parts are equal and ``suffix_map``
    translates the ``adata`` suffix into the ``adata_clean`` suffix.

    Args:
        adata: Source object exposing ``obs_names`` (string barcodes) and an
            ``obs`` table containing every column in ``columns_to_transfer``.
        adata_clean: Target object exposing ``obs_names``, ``obs`` and
            ``shape``. Its ``obs`` table is modified in place.
        suffix_map: Mapping from a barcode suffix used in ``adata`` to the
            corresponding suffix used in ``adata_clean``. Source cells whose
            suffix is not a key of this mapping are ignored.
        columns_to_transfer: Names of the ``adata.obs`` columns to copy.

    Returns:
        ``adata_clean`` (the same object that was passed in). Target cells
        without a partner in ``adata`` hold missing values in the new columns.

    Raises:
        KeyError: If a requested column is absent from ``adata.obs``.
        ValueError: If a target cell is matched by more than one source cell,
            which would make the transferred value ambiguous.
    """
    import pandas as pd

    def split_barcode(barcode):
        # Split once at the last '-' -> (core, suffix). A barcode without any
        # '-' yields an empty core and the whole barcode as suffix.
        core, _, suffix = barcode.rpartition('-')
        return core, suffix

    key_on_target = ['core_barcode', 'suffix_adata_clean']
    key_on_source = ['core_barcode', 'expected_clean_suffix']

    # Source table: one row per cell of `adata`, with its suffix already
    # translated into the naming used by `adata_clean`.
    source_parts = [split_barcode(name) for name in adata.obs_names]
    source_df = pd.DataFrame({
        'barcode_adata': list(adata.obs_names),
        'core_barcode': [core for core, _ in source_parts],
        'suffix_adata': [suffix for _, suffix in source_parts],
        'expected_clean_suffix': [suffix_map.get(suffix) for _, suffix in source_parts],
    })
    for col in columns_to_transfer:
        source_df[col] = adata.obs[col].values

    # Cells whose suffix has no translation can never match a target cell;
    # drop them so no missing-valued keys take part in the merge.
    source_df = source_df[source_df['expected_clean_suffix'].notna()]

    # Target table: one row per cell of `adata_clean`.
    target_parts = [split_barcode(name) for name in adata_clean.obs_names]
    target_df = pd.DataFrame({
        'barcode_adata_clean': list(adata_clean.obs_names),
        'core_barcode': [core for core, _ in target_parts],
        'suffix_adata_clean': [suffix for _, suffix in target_parts],
    })

    # Left merge keeps every target cell, matched or not.
    merged = pd.merge(
        target_df,
        source_df,
        left_on=key_on_target,
        right_on=key_on_source,
        how='left',
    )

    # More rows than target cells means at least one target cell matched
    # several source cells. Fail with an explicit message instead of the
    # opaque duplicate-label reindexing error pandas raises on assignment.
    if len(merged) > len(target_df):
        repeated = merged['barcode_adata_clean'].duplicated(keep=False)
        examples = merged.loc[repeated, 'barcode_adata_clean'].unique()[:5].tolist()
        raise ValueError(
            "Ambiguous annotation transfer: multiple source cells map to the "
            f"same target barcode (e.g. {examples})"
        )

    # Index by target barcode once, then align every column on it.
    annotated = merged.set_index('barcode_adata_clean')
    for col in columns_to_transfer:
        adata_clean.obs[col] = annotated.loc[adata_clean.obs_names, col]

    # Report how many target cells ended up with a non-missing value.
    print(f"Transferred columns: {columns_to_transfer}")
    for col in columns_to_transfer:
        matched_count = adata_clean.obs[col].notna().sum()
        print(f" {col}: {matched_count} / {adata_clean.shape[0]} cells have value")

    return adata_clean  # same object as the input, returned for convenience

In [None]:
# Define suffix mapping
# Trailing barcode suffix used in adata -> suffix used for the matching cells
# in adata_clean.
suffix_map = {'A': '1', 'B': '3', 'C': '5', 'D': '6', 'E': '7', 'F': '8'}

# obs columns copied from adata onto adata_clean.
columns_to_transfer = [
    'neuron_recluster_id', 'group', 'cluster', 'cell_type',
    'injury', 'AAV', 'hemisphere',
]

adata_clean = transfer_annotations(
    adata,
    adata_clean,
    suffix_map=suffix_map,
    columns_to_transfer=columns_to_transfer,
)

In [None]:
# Count cells per transferred neuron_recluster_id value (value_counts leaves
# out missing values by default, so unmatched cells are not listed).
adata_clean.obs.neuron_recluster_id.value_counts()

In [None]:
# Keep only the cells that carry a non-missing neuron_recluster_id, as an
# independent copy rather than a view.
adata_clean_neurons = adata_clean[adata_clean.obs['neuron_recluster_id'].notna()].copy()

print(f"Neuron cells in adata_clean_neurons: {adata_clean_neurons.shape[0]}")

In [None]:
# Display a summary of the neuron subset.
adata_clean_neurons

In [None]:
# Same selection on the GEO object: cells with a non-missing
# neuron_recluster_id, printed for comparison with the count above.
adata_neurons = adata[adata.obs['neuron_recluster_id'].notna()].copy()
print(f"Number of neurons in adata_neurons: {adata_neurons.shape[0]}")

In [None]:
# Remove SOLO-detected doublets: retain only the cells whose solo_prediction
# is 'singlet', then display a summary of what is left.
adata_clean_neurons = adata_clean_neurons[adata_clean_neurons.obs['solo_prediction'].eq('singlet')].copy()
adata_clean_neurons

# Export

In [None]:
# Write the filtered neuron dataset to disk as a gzip-compressed h5ad file.
output_dir = '/path/to/data/h5ad'
adata_clean_neurons.write_h5ad(os.path.join(output_dir, '03_neurons-clean.h5ad'), compression='gzip')