In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

# Session-wide scanpy configuration: most verbose logging level, print the
# scanpy header, and draw figures at 80 dpi on a white background.
sc.settings.verbosity = 3  # scale: 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.logging.print_header()
sc.settings.set_figure_params(facecolor='white', dpi=80)

In [None]:
# Load all healthy samples with one loop instead of one copy-pasted cell per
# sample. The earlier version also read `GRCh38_1` twice into the same
# variable; that redundant second read is gone.
#
# The order of this list matters: `ad.concat` is called without `keys`, so each
# sample is labelled by its position in the list, and that label is what ends
# up in `obs["individual"]` and in the suffix appended to every cell barcode.
healthy_sample_dirs = [
    '../data/cancerdata/healthy_cd34_bm/filtered_matrices_10xgenomics/',
    '../data/cancerdata/healthy_cd34_bm/hua_et_al/GRCh38_1',
    '../data/cancerdata/healthy_cd34_bm/hua_et_al/GRCh38_2',
    '../data/cancerdata/healthy_cd34_bm/hua_et_al/GRCh38_3',
    '../data/cancerdata/healthy_cd34_bm/hua_et_al/GRCh38_4',
]

# One AnnData per sample; `healthy_samples[0]` is what used to be `hdata1`,
# `healthy_samples[1]` the former `hdata2`, and so on.
healthy_samples = [
    sc.read_10x_mtx(
        sample_dir,                  # the directory with the `.mtx` file
        var_names='gene_symbols',    # use gene symbols for the variable names (variables-axis index)
        cache=True,                  # write a cache file for faster subsequent reading
    )
    for sample_dir in healthy_sample_dirs
]

# Stack the samples cell-wise, recording the sample of origin in
# `obs["individual"]` and suffixing barcodes with "_<label>".
healthy_data = ad.concat(healthy_samples, label="individual", index_unique="_")
healthy_data.var_names_make_unique()
healthy_data.obs_names_make_unique()

In [None]:
# Display the summary of the combined healthy AnnData object.
healthy_data

In [None]:
# Mark every healthy cell with disease_status = 0 (the cancer cells are
# marked with 1 further down), then show the resulting obs table.
healthy_data.obs['disease_status'] = 0
healthy_data.obs

In [None]:
# Build a sample identifier of the form "<individual> healthy" for every cell
# so that sample labels stay distinguishable once healthy and cancer data are
# merged (both cohorts number their individuals independently).
#
# This is done as a single vectorized column operation. The earlier
# `iterrows()` / `.at[...]` loop produced the same strings but executed Python
# code once per cell, which is very slow for tables with many cells.
#
# `astype(str)` converts the `individual` column (typically categorical when
# created by `ad.concat(label=...)`) to plain strings so `+` concatenates.
healthy_data.obs['individual_unique'] = (
    healthy_data.obs['individual'].astype(str) + " healthy"
)

In [None]:
# Inspect the obs table, now including the `individual_unique` column.
healthy_data.obs

In [None]:
# Load all untreated cancer samples with one loop instead of 26 copy-pasted
# cells that differed only in directory and file-name prefix.
#
# Each entry is (sub-directory, file-name prefix). A prefix of None means the
# files in that directory carry the standard, un-prefixed 10x names (this is
# also the default of `sc.read_10x_mtx`).
#
# The order of this table matters: `ad.concat` is called without `keys`, so
# each sample is labelled by its position in the list, and that label is what
# ends up in `obs["individual"]` and in the suffix appended to every barcode.
cancer_root_dir = '../data/cancerdata/cancer_cd34_bm'
cancer_sample_specs = [
    ('notreatment1', None),
    ('notreatment2', None),
    ('notreatment3', "GSM6447689_LTB5109."),
    ('notreatment4', "GSM6447695_SF-100109-106293."),
    ('notreatment5', "GSM6447696_SF-100109-111451."),
    ('notreatment6', "GSM6447697_SF-120628-00475."),
    ('notreatment7', "GSM6447698_SF-130612-00056."),
    ('notreatment8', "GSM6447699_SF-100109-110236."),
    ('notreatment9', "GSM6447700_SF-140401-00158."),
    ('notreatment10', "GSM6447701_SF-140602-00025."),
    ('notreatment11', "GSM6447702_SF-140722-00012."),
    ('notreatment12', "GSM6447704_2V001."),
    ('notreatment13', "GSM6447705_SF-141010-00049."),
    ('notreatment14', "GSM6447706_SF-161129-00158."),
    ('notreatment15', "GSM6447707_6AE001."),
    ('notreatment16', "GSM6447708_6AC001."),
    ('notreatment17', "GSM6447709_6AD001."),
    ('notreatment18', "GSM6447710_SF-100109-101914."),
    ('notreatment19', "GSM6447711_SF-120425-00035."),
    ('notreatment20', "GSM6447712_6AD001.rep2."),
    ('notreatment21', "GSM6447713_SF-120926-00014."),
    ('notreatment22', "GSM6447714_SF-140318-00065."),
    ('notreatment23', "GSM6447715_SF-140507-00419."),
    ('notreatment24', "GSM6447716_SF-160268-00045."),
    ('notreatment25', "GSM6447717_SF-160722-00003."),
    ('notreatment26', "GSM6447718_SF-161123-00029."),
]

# One AnnData per sample; `cancer_samples[0]` is what used to be `cdata1`,
# `cancer_samples[1]` the former `cdata2`, and so on.
cancer_samples = [
    sc.read_10x_mtx(
        f'{cancer_root_dir}/{sample_subdir}/',  # the directory with the `.mtx` file
        prefix=file_prefix,
        var_names='gene_symbols',               # use gene symbols for the variable names (variables-axis index)
        cache=True,                             # write a cache file for faster subsequent reading
    )
    for sample_subdir, file_prefix in cancer_sample_specs
]

# Stack the samples cell-wise, recording the sample of origin in
# `obs["individual"]` and suffixing barcodes with "_<label>".
cancer_data = ad.concat(cancer_samples, label="individual", index_unique="_")
cancer_data.var_names_make_unique()
cancer_data.obs_names_make_unique()

In [None]:
# Mark every cancer cell with disease_status = 1 (the healthy cells were
# marked with 0 above), then show the resulting obs table.
cancer_data.obs['disease_status'] = 1
cancer_data.obs

In [None]:
# Build a sample identifier of the form "<individual> cancer" for every cell
# so that sample labels stay distinguishable once healthy and cancer data are
# merged (both cohorts number their individuals independently).
#
# This is done as a single vectorized column operation. The earlier
# `iterrows()` / `.at[...]` loop produced the same strings but executed Python
# code once per cell, which is very slow for tables with many cells.
#
# `astype(str)` converts the `individual` column (typically categorical when
# created by `ad.concat(label=...)`) to plain strings so `+` concatenates.
cancer_data.obs['individual_unique'] = (
    cancer_data.obs['individual'].astype(str) + " cancer"
)

In [None]:
# Inspect the obs table, now including the `individual_unique` column.
cancer_data.obs

In [None]:
# NOTE(review): this cell shows the same table as the cell directly before it;
# it looks like a leftover and can probably be removed.
cancer_data.obs

In [None]:
# Merge the healthy and cancer cohorts into a single AnnData. The cohort of
# origin is stored in `obs["dataset"]` and appended to each barcode after "_".
all_data = ad.concat(
    [healthy_data, cancer_data],
    index_unique="_",
    label="dataset",
)
# Guarantee unique cell and gene names on the merged object, then display it.
all_data.obs_names_make_unique()
all_data.var_names_make_unique()
all_data

In [None]:
# Inspect the per-cell annotations of the merged dataset.
all_data.obs

---
Preprocessing, PCA and UMAP (following the Scanpy tutorial workflow)

In [None]:
# Basic quality filtering, applied in place: first cells are filtered with
# min_genes=200, then genes are filtered with min_cells=3 (computed on the
# cells that survived the first step).
sc.pp.filter_cells(all_data, min_genes=200)
sc.pp.filter_genes(all_data, min_cells=3)

In [None]:
# Keep a copy of the filtered matrix before the normalization, log and scaling
# steps below overwrite `X`; the scVI setup later reads this "counts" layer.
all_data.layers["counts"] = all_data.X.copy() #this is needed for scvi later

In [None]:
# Normalize each cell's total count to a target sum of 10,000 (in place).
sc.pp.normalize_total(all_data, target_sum=1e4)

In [None]:
# Apply the log1p transform to the normalized values (in place).
sc.pp.log1p(all_data)

In [None]:
# Flag highly variable genes using mean-expression and dispersion cut-offs.
# The data is only annotated here; no genes are removed.
sc.pp.highly_variable_genes(
    all_data,
    min_disp=0.5,
    min_mean=0.0125,
    max_mean=3,
)

In [None]:
# Plot the result of the highly-variable-gene selection.
sc.pl.highly_variable_genes(all_data)

In [None]:
# Store the current state (normalized and log-transformed, not yet scaled)
# in `.raw` before the scaling step below modifies `X`.
all_data.raw = all_data

In [None]:
# Scale the expression values per gene and clip them at a maximum of 10.
# NOTE(review): this runs on all genes, not only the highly variable ones;
# with scanpy's default zero-centering this presumably densifies `X`, which
# can be memory-heavy for this many cells — confirm this is intended.
sc.pp.scale(all_data, max_value=10)

In [None]:
# Compute the PCA of the scaled data with the ARPACK solver.
sc.tl.pca(all_data, svd_solver='arpack')

In [None]:
# PCA scatter plot coloured by disease status (0 = healthy, 1 = cancer).
sc.pl.pca(all_data, color='disease_status')

In [None]:
# Variance ratio per principal component, on a linear (non-log) axis.
sc.pl.pca_variance_ratio(all_data, log=False)

In [None]:
# Build the neighbourhood graph from the first 50 principal components,
# using 10 neighbours per cell.
sc.pp.neighbors(
    all_data,
    n_pcs=50,
    n_neighbors=10,
)

In [None]:
# Compute the UMAP embedding from the neighbourhood graph built above.
sc.tl.umap(all_data)

In [None]:
# UMAP scatter plot coloured by disease status (0 = healthy, 1 = cancer).
sc.pl.umap(all_data, color=['disease_status'])

----
scVI on all_data, using the stored count layer as input

In [None]:
import scvi

In [None]:
# Register `all_data` with scVI: the model reads the "counts" layer saved
# before normalization, and the per-sample identifier `individual_unique`
# is supplied as a categorical covariate.
scvi.model.SCVI.setup_anndata(
    all_data,
    categorical_covariate_keys=["individual_unique"],
    layer="counts",
)

In [None]:
# Fix the scvi-tools global seed before the model is created, so that weight
# initialisation and the training run that follows are repeatable across
# kernel restarts instead of depending on the library's default seeding.
scvi.settings.seed = 0

# Instantiate the scVI model on the registered AnnData and show its summary.
vae = scvi.model.SCVI(all_data)
vae

In [None]:
# Train the scVI model with the library's default training settings.
vae.train()