### 1. Install dependencies and load packages

In [None]:
# or create a conda env
!pip install scanpy

In [5]:
import os
import scanpy as sc
import pandas as pd
import logging

# Init logging
# Configure the root logger: INFO level and a timestamped message format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Obtain the logger through getLogger() rather than instantiating logging.Logger
# directly: a directly instantiated Logger is not part of the logger hierarchy,
# so it would not use the handler/level configured by basicConfig() above.
logger = logging.getLogger('logger')

### 2. Download dataset

In [1]:
# Ensure the local download directory exists (no error if it is already there)
os.makedirs(name='data', exist_ok=True)

In [2]:
# Bigger dataset (~1.2GB) (one of the ones we mainly use)
!wget -O data/ReplogleWeissmann2022_rpe1.h5ad "https://zenodo.org/record/7041849/files/ReplogleWeissman2022_rpe1.h5ad"
# Smaller dataset (~600MB)
!wget -O data/NormanWeissman2019_filtered.h5ad "https://zenodo.org/record/7041849/files/NormanWeissman2019_filtered.h5ad"

--2025-05-14 17:04:27--  https://zenodo.org/record/7041849/files/ReplogleWeissman2022_rpe1.h5ad
Resolving zenodo.org (zenodo.org)... 188.185.45.92, 188.185.43.25, 188.185.48.194, ...
Connecting to zenodo.org (zenodo.org)|188.185.45.92|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/7041849/files/ReplogleWeissman2022_rpe1.h5ad [following]
--2025-05-14 17:04:27--  https://zenodo.org/records/7041849/files/ReplogleWeissman2022_rpe1.h5ad
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 1236886900 (1.2G) [application/octet-stream]
Saving to: ‘data/ReplogleWeissmann2022_rpe1.h5ad’


2025-05-14 17:05:09 (28.0 MB/s) - ‘data/ReplogleWeissmann2022_rpe1.h5ad’ saved [1236886900/1236886900]



### 3. Read dataset

In [3]:
import scanpy as sc

# Path of the downloaded Replogle RPE1 dataset (.h5ad file)
adata_p = 'data/ReplogleWeissmann2022_rpe1.h5ad'
# Read the file; the result is displayed as an AnnData object in the next cell
adata = sc.read(filename=adata_p)



### The data is stored in the AnnData format: https://anndata.readthedocs.io/en/stable/
#### PyTorch integration module (scvi-tools): https://scvi-tools.org/

In [4]:
# Display the dataset overview: the bare expression on the last line of the cell
# renders the object's summary (dimensions plus the obs/var column names)
adata

AnnData object with n_obs × n_vars = 247914 × 8749
    obs: 'batch', 'gene', 'gene_id', 'transcript', 'gene_transcript', 'guide_id', 'percent_mito', 'UMI_count', 'z_gemgroup_UMI', 'core_scale_factor', 'core_adjusted_UMI_count', 'disease', 'cancer', 'cell_line', 'sex', 'age', 'perturbation', 'organism', 'perturbation_type', 'tissue_type', 'ncounts', 'ngenes', 'nperts', 'percent_ribo', 'celltype'
    var: 'chr', 'start', 'end', 'class', 'strand', 'length', 'in_matrix', 'mean', 'std', 'cv', 'fano', 'ensembl_id', 'ncounts', 'ncells'

### adata.shape = (cells x genes)
#### - this dataset has 247914 cells and 8749 genes
### .obs = observations (pd.DataFrame) --> meta information on each cell
#### - obs: the column names of this DataFrame (listed in the overview above)
### .var = variables (pd.DataFrame) --> meta information on each gene
#### - var: the column names of this DataFrame (listed in the overview above)
### adata.X = Expression matrix (CSR/CSC sparse matrix or np.ndarray)
#### - this is where the actual gene expression data is stored
### adata.layers = Storage for alternative versions of X, like normalized or scaled expression

### 4. Check meta

In [8]:
# Goal: classify the .obs['perturbation'] column with respect to cell type and perturbation type (CRISPR type)

# .obs columns that together define one class
cls_labels = ['celltype', 'perturbation_type', 'perturbation']

# Fail early with a readable message: a missing value would otherwise silently
# end up as the literal string 'nan' inside a class label.
missing_per_col = adata.obs[cls_labels].isna().sum()
if missing_per_col.any():
    raise ValueError(
        f'Cannot build cls_label, missing values found: '
        f'{missing_per_col[missing_per_col > 0].to_dict()}'
    )

# Create the classification label "<celltype>;<perturbation_type>;<perturbation>".
# Columns are concatenated as whole Series instead of joining row by row
# (agg(..., axis=1) calls Python code once per cell, which is slow for large datasets).
# astype(str) also makes this work for categorical and non-string columns.
cls_label = adata.obs[cls_labels[0]].astype(str)
for col in cls_labels[1:]:
    cls_label = cls_label + ';' + adata.obs[col].astype(str)
adata.obs['cls_label'] = cls_label
logging.info(f'Dataset has {adata.obs.cls_label.nunique()} classes.')

2025-05-14 17:20:59,548 - INFO - Dataset has 2394 classes.


In [9]:
# Inspect how many cells fall into each class to spot class imbalances
adata.obs['cls_label'].value_counts()

cls_label
retinal pigment epithelial cells;CRISPR;control    11485
retinal pigment epithelial cells;CRISPR;TFAM        3580
retinal pigment epithelial cells;CRISPR;SLC1A5      1962
retinal pigment epithelial cells;CRISPR;GFM1        1699
retinal pigment epithelial cells;CRISPR;MRPL36      1686
                                                   ...  
retinal pigment epithelial cells;CRISPR;ZC3H8          4
retinal pigment epithelial cells;CRISPR;CDK7           4
retinal pigment epithelial cells;CRISPR;NAT10          3
retinal pigment epithelial cells;CRISPR;NPAT           3
retinal pigment epithelial cells;CRISPR;NUP93          2
Name: count, Length: 2394, dtype: int64