# Data Exploration
Data Source: [Allen Brain Atlas](https://portal.brain-map.org/)

Instructions for downloading the data can be found here: https://alleninstitute.github.io/abc_atlas_access/notebooks/getting_started.html

In [1]:
import scanpy as sc
import scvi
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from pathlib import Path
from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

  from .autonotebook import tqdm as notebook_tqdm
  doc = func(self, args[0].__doc__, *args[1:], **kwargs)
  doc = func(self, args[0].__doc__, *args[1:], **kwargs)


In [2]:
download_base = Path("/home/jantine/Data/AllenBrain/abc_atlas")
abc_cache = AbcProjectCache.from_cache_dir(download_base)

abc_cache.current_manifest

In [3]:
abc_cache.list_manifest_file_names

['releases/20230630/manifest.json',
 'releases/20230830/manifest.json',
 'releases/20231215/manifest.json',
 'releases/20240330/manifest.json',
 'releases/20240831/manifest.json',
 'releases/20241115/manifest.json',
 'releases/20241130/manifest.json',
 'releases/20250131/manifest.json',
 'releases/20250331/manifest.json']

In [4]:
# Return to the latest manifest
abc_cache.load_latest_manifest()
print("after latest manifest loaded:", abc_cache.current_manifest)

after latest manifest loaded: releases/20250331/manifest.json


In [5]:
abc_cache.list_directories  # I need WMB-10Xv3

['ASAP-PMDBS-10X',
 'ASAP-PMDBS-taxonomy',
 'Allen-CCF-2020',
 'MERFISH-C57BL6J-638850',
 'MERFISH-C57BL6J-638850-CCF',
 'MERFISH-C57BL6J-638850-imputed',
 'MERFISH-C57BL6J-638850-sections',
 'SEAAD',
 'SEAAD-taxonomy',
 'WHB-10Xv3',
 'WHB-taxonomy',
 'WMB-10X',
 'WMB-10XMulti',
 'WMB-10Xv2',
 'WMB-10Xv3',
 'WMB-neighborhoods',
 'WMB-taxonomy',
 'Zeng-Aging-Mouse-10Xv3',
 'Zeng-Aging-Mouse-WMB-taxonomy',
 'Zhuang-ABCA-1',
 'Zhuang-ABCA-1-CCF',
 'Zhuang-ABCA-2',
 'Zhuang-ABCA-2-CCF',
 'Zhuang-ABCA-3',
 'Zhuang-ABCA-3-CCF',
 'Zhuang-ABCA-4',
 'Zhuang-ABCA-4-CCF']

In [6]:
abc_cache.list_data_files('WMB-10Xv3')

['WMB-10Xv3-CB/log2',
 'WMB-10Xv3-CB/raw',
 'WMB-10Xv3-CTXsp/log2',
 'WMB-10Xv3-CTXsp/raw',
 'WMB-10Xv3-HPF/log2',
 'WMB-10Xv3-HPF/raw',
 'WMB-10Xv3-HY/log2',
 'WMB-10Xv3-HY/raw',
 'WMB-10Xv3-Isocortex-1/log2',
 'WMB-10Xv3-Isocortex-1/raw',
 'WMB-10Xv3-Isocortex-2/log2',
 'WMB-10Xv3-Isocortex-2/raw',
 'WMB-10Xv3-MB/log2',
 'WMB-10Xv3-MB/raw',
 'WMB-10Xv3-MY/log2',
 'WMB-10Xv3-MY/raw',
 'WMB-10Xv3-OLF/log2',
 'WMB-10Xv3-OLF/raw',
 'WMB-10Xv3-P/log2',
 'WMB-10Xv3-P/raw',
 'WMB-10Xv3-PAL/log2',
 'WMB-10Xv3-PAL/raw',
 'WMB-10Xv3-STR/log2',
 'WMB-10Xv3-STR/raw',
 'WMB-10Xv3-TH/log2',
 'WMB-10Xv3-TH/raw']

In [7]:
# Check amount of total data 
abc_cache.get_directory_data_size('WMB-10Xv3')

'176.41 GB'

In [9]:
# List metadata files
abc_cache.list_metadata_files("WMB-taxonomy")

['cluster',
 'cluster_annotation_term',
 'cluster_annotation_term_set',
 'cluster_annotation_term_with_counts',
 'cluster_to_cluster_annotation_membership',
 'cluster_to_cluster_annotation_membership_color',
 'cluster_to_cluster_annotation_membership_pivoted']

In [10]:
# Size of metadata
abc_cache.get_directory_metadata_size("WMB-taxonomy")

'4.65 MB'

## Download the files

In [8]:
allen_wmb_data = abc_cache.get_directory_data('WMB-10Xv3')
print("WMB-10Xv3 data files:\n\t", allen_wmb_data)

WMB-10Xv3 data files:
	 [PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-CB-log2.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-CB-raw.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-CTXsp-log2.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-CTXsp-raw.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-HPF-log2.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-HPF-raw.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-HY-log2.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/abc_atlas/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-HY-raw.h5ad'), PosixPath('/home/jantine/Data/AllenBrain/ab


	Total directory size = 176.41 GB




In [14]:
# TODO
allen_wmb_metadata = abc_cache.get_directory_metadata('WMB-taxonomy')
print("WMB metadata files:\n\t", allen_wmb_metadata)

EndpointConnectionError: Could not connect to the endpoint URL: "https://allen-brain-cell-atlas.s3.amazonaws.com/?list-type=2&prefix=metadata%2FWMB-taxonomy%2F20231215%2Fcluster.csv&encoding-type=url"

# Explor whole brain data through visualisation

In [13]:
cell = abc_cache.get_metadata_dataframe(directory="WMB-10Xv3", file_name="cell_metadata")
cell.set_index("cell_label", inplace=True)

KeyError: 'File cell_metadata not found in directory WMB-10Xv3.'

# Prepare Data

In [12]:
# # Set random seed for reproducibility
# np.random.seed(42)

# # Download and load your dataset (example with Allen Brain data)
# # Replace this URL with the specific dataset you want to use
# url = "https://storage.googleapis.com/allen-brain-cell-atlas/ata_2022/raw/mouse_v1_alm.h5ad"
# adata = sc.read(url)

# # Alternatively, load local data
# # adata = sc.read_h5ad("path/to/your/data.h5ad")


In [None]:
# Basic preprocessing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Calculate quality metrics
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)

# Filter cells based on QC metrics
adata = adata[adata.obs.n_genes_by_counts < 5000, :]
adata = adata[adata.obs.pct_counts_mt < 20, :]

# Normalize and find highly variable genes
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

# Keep only highly variable genes for the model
adata = adata[:, adata.var.highly_variable]

# Save the preprocessed data
adata.write("preprocessed_data.h5ad")