# EDA

## Open data


In [None]:
import scanpy as sc

In [None]:
# 1. Load the pancreas dataset (fetched from the backup URL if the local
#    file is not on disk) and print a summary of the resulting object.
pancreas_path = "../data/pancreas.h5ad"
pancreas_url = "https://www.dropbox.com/s/qj1jlm9w10wmt0u/pancreas.h5ad?dl=1"
adata = sc.read(pancreas_path, backup_url=pancreas_url)
print(adata)

In [None]:
# Show the main data matrix stored on the loaded object.
adata.X

In [None]:
# Densify the matrix kept under `.raw` so its values are displayed in full
# (`.toarray()` suggests it is stored sparse; this can be memory-hungry).
adata.raw.X.toarray()

In [None]:
# Per-observation (cell-level) annotation table.
adata.obs

In [None]:
# Per-variable (gene-level) annotation table.
adata.var

In [None]:
# Tally how many observations carry each `celltype` label.
celltype_counts = adata.obs["celltype"].value_counts()
print(f"Cell types: {celltype_counts}")

## Visualization


In [None]:
# Report the dimensions of the embeddings already stored in `.obsm`.
for label, obsm_key in (("PCA", "X_pca"), ("UMAP", "X_umap")):
    print(f"Shape {label}: ", adata.obsm[obsm_key].shape)

In [None]:
# Draw the UMAP embedding once per listed metadata column of `.obs`.
color_fields = ["celltype", "sample"]
sc.pl.umap(adata, color=color_fields)

In [None]:
# sc.pl.highest_expr_genes(adata, n_top=20)

# Preprocessing

In [None]:
# Display the summary of the current `adata` object.
adata

In [None]:
# Dimensions of the matrix stored under `.raw`.
adata.raw.X.shape

In [None]:
# 2. Standard preprocessing of data: filter genes by the number of cells they
#    appear in, then cells by their number of genes (both act on `adata`
#    in place, so the order of the two calls matters).
min_cells_per_gene = 10
min_genes_per_cell = 200
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)

# Normalisation and log-transform are left disabled in this cell:
# sc.pp.normalize_total(adata)
# sc.pp.log1p(adata)

print(f"Shape: {adata.shape}")

## Exploration


In [None]:
# Read the pancreas dataset again (downloading from the backup URL if the
# local copy is absent) and print its summary.
# NOTE(review): presumably this reload discards the in-place filtering done
# earlier in the notebook — confirm.
data_file = "../data/pancreas.h5ad"
fallback_url = "https://www.dropbox.com/s/qj1jlm9w10wmt0u/pancreas.h5ad?dl=1"
adata = sc.read(data_file, backup_url=fallback_url)
print(adata)

In [None]:
# Flag highly variable genes per batch using the "cell_ranger" flavour,
# asking for `target_genes` genes; the later cells read the results back
# from `adata.var`.
target_genes = 1000
hvg_options = {
    "flavor": "cell_ranger",
    "n_top_genes": target_genes,
    "batch_key": "batch",
}
sc.pp.highly_variable_genes(adata, **hvg_options)

In [None]:
# As we don't have enough target genes, we need to consider the 'next best' HVGs
n_batches = len(adata.obs["batch"].cat.categories)
# Genes flagged as highly variable in every batch
# (`> n_batches - 1` is equivalent to "equal to n_batches" for integer counts).
in_all_batches = adata.var["highly_variable_nbatches"] > n_batches - 1
# Select with a single `.loc` call and sort into a new Series instead of
# sorting in place: the selection is derived from `adata.var`, and an in-place
# sort on it can trigger pandas' SettingWithCopyWarning.
nbatch1_dispersions = adata.var.loc[
    in_all_batches, "dispersions_norm"
].sort_values(ascending=False)
print(len(nbatch1_dispersions))

In [None]:
# Switch to the PBMC 10k protein dataset (downloaded from the backup URL when
# the local file is not on disk); this rebinds `adata`.
pbmc_path = "../data/pbmc_10k_protein_v3.h5ad"
pbmc_url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_10k_protein_v3.h5ad?raw=true"
adata = sc.read(pbmc_path, backup_url=pbmc_url)

# Keep the bare name last so the notebook renders the object summary.
adata

In [None]:
# 2. Standard preprocessing of data: filter genes and cells, then run total
#    count normalisation followed by log1p, all acting on `adata` in place.
gene_min_cells = 10
cell_min_genes = 200
sc.pp.filter_genes(adata, min_cells=gene_min_cells)
sc.pp.filter_cells(adata, min_genes=cell_min_genes)

# Order matters: normalise first, then log-transform.
for transform in (sc.pp.normalize_total, sc.pp.log1p):
    transform(adata)

print(f"Shape: {adata.shape}")
print(adata.X)