# AIDO.Cell Quickstart

This demo quickly walks through installing AIDO.Cell and embedding new single-cell data.

__Requirements__:
- A100 GPU or equivalent

### Install ModelGenerator and extra dependencies for tutorials

In [None]:
!git clone https://github.com/genbio-ai/ModelGenerator.git
%cd ModelGenerator
!pip install -e ".[flash_attn]"
%cd experiments/AIDO.Cell
!pip install -r requirements.txt

In [None]:
# Restart the session after installing (manual step), so that the newly
# installed packages are picked up by the kernel.

# Then navigate back to the AIDO.Cell directory.
# The path is relative to the directory that contains the ModelGenerator clone.
%cd ModelGenerator/experiments/AIDO.Cell

### Grab some data from CELLxGENE and load into anndata

In [None]:
%%bash
# Download the example dataset to data/data.h5ad.
# Fail the cell on any error: previously a failed wget was masked by the
# trailing `cd ..`, which always exits 0.
set -euo pipefail
mkdir -p data
# On failure, remove the empty/partial file that `wget -O` leaves behind so a
# later read does not pick up a corrupt h5ad.
wget -nv -O data/data.h5ad 'https://datasets.cellxgene.cziscience.com/cff5f6b2-bf37-400e-96dc-470d480f581e.h5ad' || { rm -f data/data.h5ad; exit 1; }

In [1]:
import anndata as ad
import scanpy as sc

# Load the dataset that was downloaded to data/data.h5ad.
adata = ad.read_h5ad('data/data.h5ad')
# Basic quality filtering, applied to `adata` in place (return values are unused):
# first filter cells with min_genes=500, then filter genes with min_cells=3.
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=3)
# No more normalization needed, AIDO.Cell uses raw counts

### Preprocess the anndata for AIDO.Cell

In [2]:
import warnings
# Suppress all Python warnings from here on. This is set before importing
# cell_utils so that warnings raised during that import are hidden as well.
warnings.filterwarnings('ignore')
import cell_utils
# Align `adata` to the AIDO.Cell gene set. The call returns the aligned AnnData
# and a mask that is later used to select gene positions from the embeddings.
# NOTE(review): `is_esng` presumably indicates that genes are identified by
# Ensembl (ENSG) IDs -- confirm against cell_utils.align_adata.
aligned_adata, attention_mask = cell_utils.align_adata(adata, is_esng=True)

###########  Aligning data to AIDO.Cell  ###########
AIDO.Cell was pretrained on a fixed set of 19264 genes.
Aligning your data to the AIDO.Cell gene set...
[94m•[0m standardized 19208/19264 terms
734 in your data that cannot be used by AIDO.Cell. Removing these.
['ENSG00000011052' 'ENSG00000026036' 'ENSG00000068781' 'ENSG00000081818'
 'ENSG00000093134' 'ENSG00000099725' 'ENSG00000100101' 'ENSG00000108825'
 'ENSG00000111780' 'ENSG00000112852' 'ENSG00000113205' 'ENSG00000113209'
 'ENSG00000113211' 'ENSG00000113212' 'ENSG00000113248' 'ENSG00000114786'
 'ENSG00000120322' 'ENSG00000120324' 'ENSG00000120327' 'ENSG00000120328'
 'ENSG00000124208' 'ENSG00000124593' 'ENSG00000125462' 'ENSG00000125695'
 'ENSG00000125954' 'ENSG00000130649' 'ENSG00000131152' 'ENSG00000133475'
 'ENSG00000135747' 'ENSG00000137878' 'ENSG00000141979' 'ENSG00000142396'
 'ENSG00000144785' 'ENSG00000151779' 'ENSG00000158483' 'ENSG00000159239'
 'ENSG00000162006' 'ENSG00000166984' 'ENSG00000167774' 'ENSG00000167807'
 'EN

### Get AIDO.Cell embeddings

AIDO.Cell was designed for use with the ModelGenerator CLI.
We strongly recommend running the AIDO.Cell model through ModelGenerator.
For more information, see:
- [Using ModelGenerator to finetune AIDO.Cell](./tutorial_cell_classification.ipynb)
- [ModelGenerator Docs](https://genbio-ai.github.io/ModelGenerator/)

In [None]:
# Embed a small batch of cells with AIDO.Cell and print the resulting embeddings.
import anndata as ad
import numpy as np
import torch
import sys
from modelgenerator.tasks import Embed

# The following is equivalent to the ModelGenerator CLI command:
# mgen predict --model Embed --model.backbone aido_cell_3m \
#   --data CellClassificationDataModule --data.test_split_files <your_anndata>.h5ad

# If not using mgen, this should be configured manually.
device = 'cuda'
batch_size = 2

model = Embed.from_config({
        "model.backbone": "aido_cell_3m",
        "model.batch_size": batch_size
    }).eval()
model = model.to(device).to(torch.bfloat16)

# All data must be in bfloat16
# Take the first `batch_size` cells; densify only if X is stored as a sparse matrix.
batch_X = aligned_adata[:batch_size].X
batch_np = batch_X.toarray() if hasattr(batch_X, 'toarray') else np.asarray(batch_X)
batch_tensor = torch.from_numpy(batch_np).to(torch.bfloat16).to(device)
# Call transform and embed.
# This is inference only: disable autograd so no graph is built and GPU memory
# is not spent on activations kept for a backward pass that never happens.
with torch.no_grad():
    batch_transformed = model.transform({'sequences': batch_tensor})
    full_embs = model(batch_transformed)

# Full Embeddings
print('FULL EMBEDDING')
print('(batch_size, genes, embedding_dim)')
print(full_embs.shape)
print(full_embs)
print('-------------------------------------')

# Non-Zero Genes Embeddings
# Keep only the gene positions selected by `attention_mask` (returned by
# cell_utils.align_adata). The full embeddings stay available as `full_embs`;
# `embs` holds the masked result, as before.
print('NON-ZERO GENES EMBEDDING')
embs = full_embs[:, attention_mask.astype(bool), :]
print('(batch_size, genes, embedding_dim)')
print(embs.shape)
print(embs)

FULL EMBEDDING
(batch_size, genes, embedding_dim)
torch.Size([2, 19264, 128])
tensor([[[-1.0703e+00, -2.4121e-01, -2.1719e+00,  ...,  1.6211e-01,
           6.0654e-04, -3.7305e-01],
         [-6.4062e-01, -2.2344e+00, -1.6484e+00,  ..., -8.0469e-01,
           4.1406e-01, -1.3672e+00],
         [ 4.1406e-01, -1.9062e+00, -6.0547e-01,  ..., -1.0391e+00,
          -6.2012e-02, -3.2812e-01],
         ...,
         [-4.0039e-01, -2.7500e+00, -1.8906e+00,  ..., -3.6328e-01,
          -8.1055e-02,  5.6250e-01],
         [ 7.2266e-02,  3.6133e-01, -1.0469e+00,  ...,  1.2598e-01,
           3.7891e-01, -1.2207e-01],
         [-1.0859e+00, -1.9922e+00, -9.1016e-01,  ..., -7.7734e-01,
           9.1016e-01, -1.0742e-01]],

        [[-1.0469e+00, -2.3633e-01, -2.3438e+00,  ...,  2.1777e-01,
           7.7148e-02, -3.2031e-01],
         [-6.3672e-01, -2.2656e+00, -1.5469e+00,  ..., -8.5938e-01,
           4.8438e-01, -1.2969e+00],
         [ 4.5898e-01, -2.0312e+00, -5.9766e-01,  ..., -1.0859e+00