In [None]:
import torch
%env CUDA_VISIBLE_DEVICES=0
torch.cuda.is_available()

In [None]:
import scvi

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import hotspot
import pickle
import seaborn as sns
import gcsfs

## load Boiarsky et al MM data

In [None]:
!gsutil cp gs://rebecca-summer23/cd138_adata_MOREHVG_noIG.h5ad /tmp

In [None]:
cd138_adata = sc.read_h5ad("/tmp/cd138_adata_MOREHVG_noIG.h5ad")

In [None]:
cd138_adata.var.highly_variable.sum()

### better define HVGs

In [None]:
# Re-select highly variable genes from the "lognorm" layer; the result is written to
# cd138_adata.var, replacing the `highly_variable` flag that was counted above.
sc.pp.highly_variable_genes(
    cd138_adata,
    layer="lognorm",
    min_mean=0.0125,
    max_mean=4,
    min_disp=0.3,
)
# number of genes now flagged as highly variable
cd138_adata.var["highly_variable"].sum()

In [None]:
sc.pl.highly_variable_genes(cd138_adata)


## run vanilla scVI 

In [None]:
scvi.model.SCVI.setup_anndata(cd138_adata, layer="counts")

In [None]:
vae = scvi.model.SCVI(cd138_adata, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
!gsutil cp gs://rebecca-summer23/outputs/vanilla_scvi_cd138.pkl /tmp

In [None]:
# Load the previously trained vanilla scVI model that was copied to /tmp.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('/tmp/vanilla_scvi_cd138.pkl', 'rb') as file:
    vae = pickle.load(file)

In [None]:
cd138_adata.obsm["X_scvi"] = vae.get_latent_representation()

In [None]:
# Save the original UMAP once, before sc.tl.umap overwrites obsm['X_umap'].
# The guard keeps a re-run of this cell from replacing the saved embedding with
# whichever recomputed UMAP is current at that point; .copy() avoids sharing the array.
if "X_umap_pca-based" not in cd138_adata.obsm:
    cd138_adata.obsm["X_umap_pca-based"] = cd138_adata.obsm['X_umap'].copy()

In [None]:
#post-code changes -- not qualitatively different (which is good, since I did not pass in CNVs in this test)
sc.pp.neighbors(cd138_adata, use_rep="X_scvi")
sc.tl.umap(cd138_adata)

In [None]:
sc.pl.umap(cd138_adata, color=["disease_stage", "person"])

#### calculate PC regression for vanilla scVI latent

In [None]:
from sklearn.metrics import r2_score

# per-cell signature column in .obs that each latent dimension is scored against
sig = "logW24"
signature = cd138_adata.obs[sig]
latent = cd138_adata.obsm['X_scvi']

# One score per latent dimension, for the first 10 dimensions.
# NOTE(review): r2_score(y_true, y_pred) scores each raw latent dimension as if it were a
# prediction of the signature; no regression is fit here. Confirm this is the intended metric.
r2s = [r2_score(signature, latent[:, dim]) for dim in range(10)]
#print(np.sum(np.array(r2s)*cd138_adata.uns['pca']['variance'][:n_pcs]))
print(np.sum(np.array(r2s)))

#### expression of NMF modules

In [None]:
sc.pl.umap(cd138_adata, color=['logW3', 'logW4', 'logW5', 'logW8', 'logW9', 'logW11', 'logW16', 'logW20', 'logW24', 'logW28'])

#### color by translocation

In [None]:
sc.pl.umap(cd138_adata, color=['driver event', 'driver_event_specific'])

# run vanilla scvi with HVGs (do we need to retain more HVGs?)

In [None]:
cd138_adata_hvg = cd138_adata[:,cd138_adata.var.highly_variable].copy()

In [None]:
scvi.model.SCVI.setup_anndata(cd138_adata_hvg, layer="counts")

In [None]:
vae = scvi.model.SCVI(cd138_adata_hvg, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
cd138_adata.obsm["X_scvi_hvg"] = vae.get_latent_representation()

In [None]:
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_hvg")
sc.tl.umap(cd138_adata)

In [None]:
sc.pl.umap(cd138_adata, color=["disease_stage", "person"])

In [None]:
sc.pl.umap(cd138_adata, color=["disease_stage", "person"])

## batch correct by patient in vanilla scVI 

In [None]:
cd138_adata_hvg = cd138_adata[:,cd138_adata.var.highly_variable].copy()

In [None]:
scvi.model.SCVI.setup_anndata(cd138_adata_hvg, layer="counts", batch_key="person")

In [None]:
vae = scvi.model.SCVI(cd138_adata_hvg, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
cd138_adata.obsm["X_scvi_batchcorrect"] = vae.get_latent_representation()

In [None]:
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_batchcorrect")
sc.tl.umap(cd138_adata)
sc.pl.umap(cd138_adata, color=["person","disease_stage"])

### Do things look this bad simply because not enough HVGs were retained? Might this also have been the problem for mrVI? (I think I was getting noise even with all genes for mrVI, but double-check.)
### Might want to recalculate HVGs to include more before continuing.

In [None]:
#now with more hvgs
cd138_adata_hvg = cd138_adata[:,cd138_adata.var.highly_variable].copy()
cd138_adata_hvg.shape

In [None]:
scvi.model.SCVI.setup_anndata(cd138_adata_hvg, layer="counts", batch_key="person")

In [None]:
vae = scvi.model.SCVI(cd138_adata_hvg, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
cd138_adata.obsm["X_scvi_batchcorrect_morehvg"] = vae.get_latent_representation()

In [None]:
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_batchcorrect_morehvg")
sc.tl.umap(cd138_adata)
sc.pl.umap(cd138_adata, color=["person","disease_stage"])

In [None]:
sc.pl.umap(cd138_adata, color="disease_stage")

### If I rerun the patient batch correction with all genes, do I recover the original latent space (with disease stages fairly well separated)?

In [None]:
scvi.model.SCVI.setup_anndata(cd138_adata, layer="counts", batch_key="person")

In [None]:
vae = scvi.model.SCVI(cd138_adata, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
# pickle the scVI model trained with person as the batch key (all genes)
import pickle

# The context manager closes the file even if pickling raises.
with open('outputs/scvi_ptbatchcorrect_allgenes.pkl', 'wb') as file:
    pickle.dump(vae, file)

!gsutil mv outputs/scvi_ptbatchcorrect_allgenes.pkl gs://rebecca-summer23/outputs/scvi_ptbatchcorrect_allgenes.pkl

In [None]:
cd138_adata.obsm["X_scvi_ptbatchcorrect_allgenes"] = vae.get_latent_representation()

In [None]:
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_ptbatchcorrect_allgenes")
sc.tl.umap(cd138_adata)

In [None]:
sc.pl.umap(cd138_adata, color=["person","disease_stage"])

In [None]:
sc.pl.umap(cd138_adata, color=["person","disease_stage"])

In [None]:
sc.pl.umap(cd138_adata, color=['logW3', 'logW4', 'logW5', 'logW8', 'logW9', 'logW11', 'logW16', 'logW20', 'logW24', 'logW28'], ncols=3)

In [None]:
sc.pl.umap(cd138_adata, color="disease_stage")

In [None]:
sc.pl.umap(cd138_adata, color=['driver event', 'driver_event_specific'])

In [None]:
from sklearn.metrics import r2_score 

sig = "logW24"

# The patient-batch-corrected, all-genes latent is stored in .obsm under
# "X_scvi_ptbatchcorrect_allgenes"; "X_scvi_batchcorrect_allgenes" is never written,
# so indexing it raised a KeyError.
latent = cd138_adata.obsm['X_scvi_ptbatchcorrect_allgenes']

# One score per latent dimension, for the first 10 dimensions.
# NOTE(review): r2_score(y_true, y_pred) scores each raw latent dimension as if it were a
# prediction of the signature; no regression is fit here. Confirm this is the intended metric.
r2s = []
for i in np.arange(10):
    r2s.append(r2_score(cd138_adata.obs[sig], latent[:,i]))
#print(np.sum(np.array(r2s)*cd138_adata.uns['pca']['variance'][:n_pcs]))
print(np.sum(np.array(r2s)))

# get and format cnv results for input to scVI

In [None]:
cnv_genes = pd.read_table("gs://rebecca-summer23/outputs/infercnv/cd138/HMM_CNV_predictions.HMMi6.hmm_mode-samples.Pnorm_0.5.pred_cnv_genes.dat", sep="\t")

In [None]:
# parse sample name: keep the text before the first ".", then take the second
# "_"-separated field of that prefix
cnv_genes['sid'] = [
    group_name.split(".")[0].split("_")[1]
    for group_name in cnv_genes.cell_group_name
]

In [None]:
# Translate the numeric `state` column into a dosage value; states missing from
# the table become NaN.
# NOTE(review): presumably these are the six inferCNV HMM states, with dose 1 meaning
# copy-neutral -- confirm against the inferCNV output documentation.
state_to_dose = {1: 0, 2: 0.5, 3: 1, 4: 1.5, 5: 2, 6: 3}
cnv_genes['dose'] = cnv_genes['state'].map(state_to_dose)

In [None]:
#make plot of inferred dosages
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({'font.size': 16})
sns.barplot(cnv_genes.groupby('dose').count().reset_index(), x="dose", y="state")
plt.ylabel("counts (# gene-patient pairs)")
plt.xlabel("inferred CNV dosage")

In [None]:
# number of distinct genes, then number of distinct samples, present in the CNV table
print(len(cnv_genes['gene'].unique()))
print(len(cnv_genes['sid'].unique()))

In [None]:
# same two counts (genes, then samples), restricted to rows whose dose differs from 1
non_neutral = cnv_genes[cnv_genes.dose!=1]
print(len(non_neutral['gene'].unique()))
print(len(non_neutral['sid'].unique()))

In [None]:
# Key identifying each (sample, gene) pair. Vectorised string concatenation gives the
# same values as building them row by row with .iloc, without the per-row lookups.
cnv_genes['person-gene'] = cnv_genes['sid'] + cnv_genes['gene']

In [None]:
# the same gene can be in two different states in the same sample, I guess if the gene spans two regions that have different copy number estimates
cnv_genes[cnv_genes['person-gene']=="MGUS-6MRPL40"]

In [None]:
#this is not very common, there are 130 gene-sample pairs with duplicates
cnv_genes['person-gene'].duplicated().sum()

In [None]:
# for each of these person-gene pairs that are duplicated, need to choose one row
# the doses for this group are either 1, 1.5, or 2.0. Let's be conservative and always keep lower of the doses
cnv_genes['dose'] = cnv_genes.groupby('person-gene')['dose'].transform('min')

In [None]:
#drop dose=1, since that is the same as the default
cnv_genes = cnv_genes[cnv_genes.dose!=1]

In [None]:
# add cell info (in this case, same row will be repeated for all cells in sample since cnvs are sample level)
# in order to create sparse matrix, will want cell idx in .obs, so use that here instead of cell name

# Create a df with one row per cell, in .obs order, so cell_idx is the cell's row position.
# rename_axis names the index column 'cell' regardless of what the obs index is called.
# No de-duplication is done here: dropping rows would break the positional numbering
# (and make the arange assignment fail on a length mismatch).
cell_idx_df = cd138_adata.obs[['person']].rename_axis('cell').reset_index()
cell_idx_df['cell_idx'] = np.arange(len(cell_idx_df))

# merge: attach every (gene, dose) row of a sample to each of that sample's cells;
# the inner join drops cells whose person has no CNV rows
cnv_array = cell_idx_df.merge(cnv_genes[['sid','gene','dose']].drop_duplicates(), left_on="person", right_on="sid", how="inner")
cnv_array

In [None]:
#drop person id which we dont need since CNVs will be used on cell level
cnv_array = cnv_array.drop(columns=['person','sid','cell'])
cnv_array

In [None]:
#convert genes to gene_idx (row position of each gene in cd138_adata.var)
cd138_adata.var['gene_idx'] = np.arange(len(cd138_adata.var))

# rename_axis guarantees the gene-name column is called 'gene' even when the var index
# carries its own name (reset_index would otherwise not produce an 'index' column to rename)
gene_idx_df = cd138_adata.var[['gene_idx']].rename_axis('gene').reset_index()

#merge; the inner join keeps only CNV rows whose gene is present in cd138_adata.var
cnv_array = gene_idx_df.merge(cnv_array, on="gene", how="inner")

In [None]:
cnv_array.drop(columns="gene", inplace=True)

In [None]:
# create sparse matrix
# rows are indexed by cell_idx, columns by gene_idx, values are dose; the shape is
# taken from cd138_adata so positions line up with the AnnData matrix.
# NOTE: scipy sums the values of repeated (row, col) pairs when building from this
# (data, (row, col)) form, so this relies on each (cell_idx, gene_idx) pair appearing once.
from scipy.sparse import coo_array, csr_array

cnv_sparse = csr_array((cnv_array.dose, (cnv_array.cell_idx,cnv_array.gene_idx)), shape=cd138_adata.shape)

In [None]:
cnv_sparse

In [None]:
# change default value '0' to '1'
# Stored entries are shifted down by 1 so that adding 1 to the densified matrix
# restores them, while every unstored entry (implicit 0) becomes 1.
cnv_sparse.data = cnv_sparse.data-1
# NOTE: this materialises a dense array with the full shape of cnv_sparse, and rebinds
# `cnv_array` (previously the long-format DataFrame) to that dense matrix.
cnv_array = cnv_sparse.todense() + 1

In [None]:
#pickle formatted infercnv results

# The context manager closes the file even if pickling raises.
with open('outputs/cnv_array_cd138.pkl', 'wb') as file:
    pickle.dump(cnv_array, file)

!gsutil mv outputs/cnv_array_cd138.pkl gs://rebecca-summer23/outputs/cnv_array_cd138.pkl

In [None]:
# read in pickled results
!gsutil cp gs://rebecca-summer23/outputs/cnv_array_cd138.pkl /tmp

In [None]:
# Load the formatted CNV array that was copied to /tmp.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('/tmp/cnv_array_cd138.pkl', 'rb') as file:
    cnv_array = pickle.load(file)

## run PCA on CNVs and pass as continuous covariate (HVGs only)

In [None]:
cnv_df = pd.DataFrame(cnv_array, index=cd138_adata.obs.index, columns = cd138_adata.var.index)

In [None]:
cnv_dense = cnv_df.loc[:,~np.all(cnv_df==1, axis=0)]

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
cnv_pc = pca.fit_transform(cnv_dense)

In [None]:
#frac explained variance
import matplotlib.pyplot as plt
plt.plot(np.arange(20)+1, pca.explained_variance_ratio_)

In [None]:
cnv_pc_df = pd.DataFrame(cnv_pc, index=cd138_adata.obs.index, columns = ["cnv_PC"+str(i) for i in np.arange(20)+1])

In [None]:
# Add the CNV principal components to .obs, aligned on the cell index.
# Any cnv_PC* columns left over from an earlier run are dropped first; otherwise pandas
# would suffix the overlapping names with _x/_y and the plain "cnv_PC1".."cnv_PC5"
# keys passed to setup_anndata would no longer exist.
cd138_adata.obs = (
    cd138_adata.obs
    .drop(columns=cnv_pc_df.columns, errors="ignore")
    .merge(cnv_pc_df, right_index=True, left_index=True)
)

In [None]:
cd138_adata_hvg = cd138_adata[:,cd138_adata.var.highly_variable].copy()

In [None]:
scvi.model.SCVI.setup_anndata(cd138_adata_hvg, layer="counts", continuous_covariate_keys=["cnv_PC"+str(i) for i in np.arange(5)+1])

In [None]:
vae = scvi.model.SCVI(cd138_adata_hvg, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
# pickle the scVI model trained on the HVG subset with CNV PCs as continuous covariates

# The context manager closes the file even if pickling raises.
with open('outputs/scvi_contcnv_cd138_morehvg.pkl', 'wb') as file:
    pickle.dump(vae, file)

#!gsutil mv outputs/cnv_array_cd138.pkl gs://rebecca-summer23/outputs/cnv_array_cd138.pkl

In [None]:
cd138_adata.obsm["X_scvi_contcnv"] = vae.get_latent_representation()

In [None]:
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_contcnv")
sc.tl.umap(cd138_adata)
sc.pl.umap(cd138_adata, color=["person","disease_stage"])

In [None]:
sc.pl.umap(cd138_adata, color=['logW3', 'logW4', 'logW5', 'logW8', 'logW9', 'logW11', 'logW16', 'logW20', 'logW24', 'logW28'], ncols=3)

### Run hotspot analysis on this latent space -- why are the healthies and diseased blobbing together?

### Why are some healthy samples being split in two?

## run PCA on CNVs and pass as continuous covariate (all genes)

In [None]:
cnv_df = pd.DataFrame(cnv_array, index=cd138_adata.obs.index, columns = cd138_adata.var.index)

In [None]:
cnv_dense = cnv_df.loc[:,~np.all(cnv_df==1, axis=0)]

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
cnv_pc = pca.fit_transform(cnv_dense)

In [None]:
#frac explained variance
import matplotlib.pyplot as plt
plt.plot(np.arange(20)+1, pca.explained_variance_ratio_)

In [None]:
cnv_pc_df = pd.DataFrame(cnv_pc, index=cd138_adata.obs.index, columns = ["cnv_PC"+str(i) for i in np.arange(20)+1])

In [None]:
# Add the CNV principal components to .obs, aligned on the cell index.
# .obs may already hold cnv_PC* columns from an earlier merge; merging again without
# dropping them makes pandas suffix the overlapping names with _x/_y, after which the
# plain "cnv_PC1".."cnv_PC5" keys passed to setup_anndata no longer exist.
cd138_adata.obs = (
    cd138_adata.obs
    .drop(columns=cnv_pc_df.columns, errors="ignore")
    .merge(cnv_pc_df, right_index=True, left_index=True)
)

In [None]:
scvi.model.SCVI.setup_anndata(cd138_adata, layer="counts", continuous_covariate_keys=["cnv_PC"+str(i) for i in np.arange(5)+1])

In [None]:
vae = scvi.model.SCVI(cd138_adata, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
# pickle the scVI model trained on all genes with CNV PCs as continuous covariates

# The context manager closes the file even if pickling raises.
with open('outputs/scvi_contcnv_cd138_allgenes.pkl', 'wb') as file:
    pickle.dump(vae, file)

!gsutil mv outputs/scvi_contcnv_cd138_allgenes.pkl gs://rebecca-summer23/outputs/scvi_contcnv_cd138_allgenes.pkl

In [None]:
!gsutil cp gs://rebecca-summer23/outputs/scvi_contcnv_cd138_allgenes.pkl /tmp/scvi_contcnv_cd138_allgenes.pkl 

In [None]:
# Load the pickled all-genes model trained with CNV PCs as continuous covariates.
# The gsutil cp above writes to the absolute path /tmp/...; the relative 'tmp/...' path
# pointed at a directory under the current working directory instead.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('/tmp/scvi_contcnv_cd138_allgenes.pkl', 'rb') as file:
    vae_cnvcont_allgenes = pickle.load(file)

In [None]:
# Load the pickled all-genes model trained with CNV PCs as continuous covariates
# (this cell repeats the load performed in the previous cell).
# The file is copied to the absolute path /tmp/...; the relative 'tmp/...' path
# pointed at a directory under the current working directory instead.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('/tmp/scvi_contcnv_cd138_allgenes.pkl', 'rb') as file:
    vae_cnvcont_allgenes = pickle.load(file)

In [None]:
# NOTE(review): this reads the latent from `vae` (whatever model is currently bound to
# that name in the session), not from `vae_cnvcont_allgenes` loaded from the pickle
# in the preceding cells -- confirm which model is intended.
cd138_adata.obsm["X_scvi_contcnv_allgenes"] = vae.get_latent_representation()

In [None]:
cd138_adata

In [None]:
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_contcnv_allgenes")
sc.tl.umap(cd138_adata)
sc.pl.umap(cd138_adata, color=["person","disease_stage"])

In [None]:
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_contcnv_allgenes")
sc.tl.umap(cd138_adata)
sc.pl.umap(cd138_adata, color=["person","disease_stage"])

In [None]:
sc.pl.umap(cd138_adata, color=['logW3', 'logW4', 'logW5', 'logW8', 'logW9', 'logW11', 'logW16', 'logW20', 'logW24', 'logW28'], ncols=3)

## run scvi with dosage correction passing in cnv matrix

In [None]:
cd138_adata.obsm['cnv'] = cnv_array

In [None]:
# Register the counts layer together with the "cnv" entry of .obsm.
# NOTE(review): `cnv_key` presumably comes from a modified scvi-tools build that reads a
# cells x genes dosage matrix from .obsm -- confirm against the installed version.
scvi.model.SCVI.setup_anndata(cd138_adata, layer="counts", cnv_key="cnv")

In [None]:
vae = scvi.model.SCVI(cd138_adata, gene_likelihood='nb')

In [None]:
vae.train()

In [None]:
cd138_adata.obsm["X_scvi_cnv"] = vae.get_latent_representation()

In [None]:
# passing in cnv -- not qualitatively different
sc.pp.neighbors(cd138_adata, use_rep="X_scvi_cnv")
sc.tl.umap(cd138_adata)
sc.pl.umap(cd138_adata, color="person")

In [None]:
sc.pl.umap(cd138_adata, color="disease_stage")

In [None]:
# pickle the scVI model trained with the CNV dosage matrix

# The context manager closes the file even if pickling raises.
with open('outputs/cnv_dosage_scvi_cd138.pkl', 'wb') as file:
    pickle.dump(vae, file)


In [None]:
!gsutil mv outputs/cnv_dosage_scvi_cd138.pkl gs://rebecca-summer23/outputs/cnv_dosage_scvi_cd138.pkl