# Data Setup & Environment

This notebook downloads the 10x Genomics Visium Human Breast Cancer (Visium v1.3.0) sample into `data/raw/Visium_Human_Breast_Cancer/` and extracts the files.

Dataset page:
https://www.10xgenomics.com/datasets/human-breast-cancer-visium-fresh-frozen-whole-transcriptome-1-standard

Requirements:
Download all required files, set up your Python environment, load the AnnData object, and verify data integrity before analysis begins.

# Download Script

In [2]:
from __future__ import annotations

import shutil
import subprocess
import tarfile
from pathlib import Path

# Resolve project root whether notebook is run from repo root or scripts/
cwd = Path.cwd().resolve()
if (cwd / 'README.md').exists() and (cwd / 'data').exists():
    project_root = cwd
elif (cwd.parent / 'README.md').exists() and (cwd.parent / 'data').exists():
    project_root = cwd.parent
else:
    raise RuntimeError(
        f'Cannot locate project root from cwd={cwd}. Start Jupyter in the repo root '
        'or in the scripts/ folder.'
    )

raw_dir = project_root / 'data' / 'raw' / 'Visium_Human_Breast_Cancer'
raw_dir.mkdir(parents=True, exist_ok=True)
print('Raw dir:', raw_dir)

BASE = 'https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Human_Breast_Cancer'
files = [
    'Visium_Human_Breast_Cancer_spatial.tar.gz',
    'Visium_Human_Breast_Cancer_filtered_feature_bc_matrix.h5',
    'Visium_Human_Breast_Cancer_raw_feature_bc_matrix.h5',
    'Visium_Human_Breast_Cancer_molecule_info.h5',
    'Visium_Human_Breast_Cancer_analysis.tar.gz',
]

wget = shutil.which('wget')
if not wget:
    raise RuntimeError(
        '`wget` not found on PATH. Install it (e.g., Git Bash / WSL / conda) '
        'or tell me and I will switch this notebook to Python-only downloads.'
    )

# Download into raw_dir (resume: -c)
for name in files:
    url = f'{BASE}/{name}'
    cmd = [wget, '-c', '-P', str(raw_dir), url]
    print(' '.join(cmd))
    subprocess.run(cmd, check=True)

# Extract tarballs into raw_dir
tar = shutil.which('tar')
for tgz_name in ['Visium_Human_Breast_Cancer_spatial.tar.gz', 'Visium_Human_Breast_Cancer_analysis.tar.gz']:
    tgz = raw_dir / tgz_name
    if not tgz.exists():
        continue

    if tar:
        subprocess.run([tar, '-xzf', str(tgz), '-C', str(raw_dir)], check=True)
    else:
        with tarfile.open(tgz, mode='r:gz') as tf:
            tf.extractall(raw_dir)

print('Done.')


Raw dir: C:\Users\mmsid\Documents\github\My Project\spatial_biology_project\data\raw\Visium_Human_Breast_Cancer
C:\ProgramData\chocolatey\bin\wget.EXE -c -P C:\Users\mmsid\Documents\github\My Project\spatial_biology_project\data\raw\Visium_Human_Breast_Cancer https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Human_Breast_Cancer/Visium_Human_Breast_Cancer_spatial.tar.gz
C:\ProgramData\chocolatey\bin\wget.EXE -c -P C:\Users\mmsid\Documents\github\My Project\spatial_biology_project\data\raw\Visium_Human_Breast_Cancer https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Human_Breast_Cancer/Visium_Human_Breast_Cancer_filtered_feature_bc_matrix.h5
C:\ProgramData\chocolatey\bin\wget.EXE -c -P C:\Users\mmsid\Documents\github\My Project\spatial_biology_project\data\raw\Visium_Human_Breast_Cancer https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Human_Breast_Cancer/Visium_Human_Breast_Cancer_raw_feature_bc_matrix.h5
C:\ProgramData\chocolatey\bin\wget.EXE -c -P C:\Users

# Python Setup & Loading

In [3]:
# Install Packages
%pip install -q scanpy squidpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: C:\Users\mmsid\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
import os
import tempfile
from pathlib import Path

# Workaround for Windows Store Python + numba caching: use a writable cache dir
os.environ.setdefault('NUMBA_CACHE_DIR', str(Path(tempfile.gettempdir()) / 'numba_cache'))
Path(os.environ['NUMBA_CACHE_DIR']).mkdir(parents=True, exist_ok=True)

import scanpy as sc
import squidpy as sq
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

sc.settings.verbosity = 3
fig_dir = project_root / 'figures' / 'qc'
fig_dir.mkdir(parents=True, exist_ok=True)
sc.settings.figdir = str(fig_dir)

# Load Visium data from the folder we downloaded/extracted into
adata = sc.read_visium(
    path=str(raw_dir),
    count_file='Visium_Human_Breast_Cancer_filtered_feature_bc_matrix.h5',
)

adata.var_names_make_unique()

print(adata)  
print(adata.obsm['spatial'].shape)  # Spot coordinates


  from .autonotebook import tqdm as notebook_tqdm
  left = partial(_left_join_spatialelement_table)
  left_exclusive = partial(_left_exclusive_join_spatialelement_table)
  inner = partial(_inner_join_spatialelement_table)
  right = partial(_right_join_spatialelement_table)
  right_exclusive = partial(_right_exclusive_join_spatialelement_table)


reading C:\Users\mmsid\Documents\github\My Project\spatial_biology_project\data\raw\Visium_Human_Breast_Cancer\Visium_Human_Breast_Cancer_filtered_feature_bc_matrix.h5


  adata = sc.read_visium(


 (0:00:00)
AnnData object with n_obs × n_vars = 4898 × 36601
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'
(4898, 2)


  utils.warn_names_duplicates("var")


In [5]:
# Check how many are actually on tissue
print(adata.obs['in_tissue'].value_counts())

# Filter to tissue-only spots
adata = adata[adata.obs['in_tissue'] == 1].copy()
print(adata)  # Expect spots after filtering

in_tissue
1    4898
Name: count, dtype: int64


AnnData object with n_obs × n_vars = 4898 × 36601
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'


In [6]:
import numpy as np

# Correct for sparse matrix (csr_matrix)
print(f"Spots on tissue  : {adata.n_obs}")
print(f"Genes detected   : {adata.n_vars}")
print(f"Median UMI/spot  : {np.median(adata.X.sum(axis=1).A1):.0f}")
print(f"Median genes/spot: {np.median((adata.X > 0).sum(axis=1).A1):.0f}")
print(f"Total UMI count  : {adata.X.sum():.0f}")
print('-> Next: 02_QC & Preprocessing.ipynb')

Spots on tissue  : 4898
Genes detected   : 36601
Median UMI/spot  : 9720
Median genes/spot: 3654
Total UMI count  : 50107992
-> Next: 02_QC & Preprocessing.ipynb
