In [26]:
import pandas as pd
import scanpy, magic
import numpy as np
from tqdm import tqdm
import sys
sys.path.append('/vast/palmer/home.mccleary/av622/Islet-Dynamics/analysis_agedmice/scib/')
import scib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import decomposition

## Get our data

In [9]:
adata_beta = scanpy.read_h5ad('../data/beta_run_2.h5ad')

In [10]:
adata_stz = scanpy.read_h5ad('data/stz_beta_ins1_ins2.h5ad')

# WT

In [12]:
adata_beta_wt = adata_beta[adata_beta.obs['samples'] == 'WT']
wt_pc_op = pd.read_pickle('../analysis_archetypes/results/WT_magic_pc_op.pkl')
wt_magic_pc_norm = pd.read_pickle('../analysis_archetypes/results/WT_magic_pc_norm.pkl')

## Get reference

In [5]:
stz = scanpy.read_h5ad('data/stz_beta_ins1_ins2.h5ad')
stz_ctrl = stz[stz.obs['groups_named_broad_cond'] == 'beta_Ctrl']
magic_op = magic.MAGIC(random_state=42, t=20)
stz_ctrl.layers['X_magic'] = magic_op.fit_transform(stz_ctrl.to_df())
ref_x = pd.DataFrame(stz_ctrl.layers['X_magic'], index=stz_ctrl.obs_names, columns=stz_ctrl.var_names)

Calculating MAGIC...
  Running MAGIC on 5600 cells and 18210 genes.
  Calculating graph and diffusion operator...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculating PCA...
    Calculated PCA in 4.12 seconds.
    Calculating KNN search...
    Calculated KNN search in 3.24 seconds.
    Calculating affinities...
    Calculated affinities in 3.22 seconds.
  Calculated graph and diffusion operator in 10.69 seconds.
  Running MAGIC with `solver='exact'` on 18210-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  with _logger.task("imputation"):


  Calculated imputation in 2.56 seconds.
Calculated MAGIC in 13.40 seconds.


## Get post correction reference

In [28]:
out = np.load('results/output_stz_to_wt_cycle_1_correspondence_lambda_10_corr_correspondence_training_counter_15000.npz')
ctrl_magan = pd.DataFrame(out['stz_to_wt'], index=stz_ctrl.obs_names, columns=adata_beta_wt.var_names)
ctrl_magan_magic_pc = wt_pc_op.transform(ctrl_magan)
ctrl_magan_magic_pc_norm = ctrl_magan_magic_pc / np.std(ctrl_magan_magic_pc[:, 0])

## Batch ASW

In [11]:
evaluation_summary = pd.DataFrame(columns=['pre_post_corr'])

In [17]:
adata_post_corr = scanpy.AnnData(ctrl_magan)
adata_post_all = scanpy.concat((adata_post_corr, adata_beta_wt), label='batch')
adata_post_all.obsm['X_pc'] = np.vstack((ctrl_magan_magic_pc_norm, wt_magic_pc_norm))
adata_post_all.obs['cell_type'] = 'beta cell'

In [18]:
evaluation_summary.loc['Batch ASW'] = scib.me.silhouette_batch(adata_post_all, batch_key='batch', label_key='cell_type', embed='X_pc', )

mean silhouette per group:            silhouette_score
group                      
beta cell          0.915022


## Cosine similarity

In [29]:
ctrl_magan = ctrl_magan.T.set_index(adata_beta_wt.var['names']).T
ctrl_magan = ctrl_magan.iloc[:, ~ctrl_magan.columns.duplicated()]

In [35]:
intersection = list(set(ref_x.columns).intersection(ctrl_magan.columns))
len(intersection)

15553

In [36]:
ref_x = ref_x[intersection].values
post_x = ctrl_magan[intersection].values

In [37]:
pre_post_corr = []
for i in tqdm(range(stz_ctrl.n_obs)):
    pre_post_corr.append(cosine_similarity(ref_x[i].reshape(1,-1), post_x[i].reshape(1,-1))[0][0])

100%|██████████| 5600/5600 [00:02<00:00, 2743.37it/s]


In [38]:
evaluation_summary.loc['Mean cell correlation pre- and post-alignment'] = np.nanmean(pre_post_corr)

In [39]:
stz_ctrl.obs['pre_post_corr'] = pre_post_corr

In [43]:
evaluation_summary = pd.concat((evaluation_summary, stz_ctrl.obs[['AT_committment', 'pre_post_corr']].groupby('AT_committment').mean()))

In [44]:
evaluation_summary.to_csv('results/ctrl_scMMGAN_stats.csv')

# OB/OB

In [46]:
adata_beta_obob = adata_beta[adata_beta.obs['samples'] == 'OB/OB']
obob_pc_op = pd.read_pickle('../analysis_archetypes/results/OB_OB_magic_pc_op.pkl')
obob_magic_pc_norm = pd.read_pickle('../analysis_archetypes/results/OB_OB_magic_pc_norm.pkl')

## Get reference

In [48]:
stz = scanpy.read_h5ad('data/stz_beta_ins1_ins2.h5ad')
stz_treat = stz[stz.obs.treatment == 'Vehicle-STZ']
magic_op = magic.MAGIC(random_state=42, t=20)
stz_treat.layers['X_magic'] = magic_op.fit_transform(stz_treat.to_df())
ref_x = pd.DataFrame(stz_treat.layers['X_magic'], index=stz_treat.obs_names, columns=stz_treat.var_names)

Calculating MAGIC...
  Running MAGIC on 1160 cells and 18210 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 2.51 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.13 seconds.
    Calculating affinities...
    Calculated affinities in 0.13 seconds.
  Calculated graph and diffusion operator in 2.79 seconds.
  Running MAGIC with `solver='exact'` on 18210-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.17 seconds.
Calculated MAGIC in 3.00 seconds.


  with _logger.task("imputation"):


## Get post correction reference

In [50]:
out = np.load('results/output_stz_to_obob_cycle_1_correspondence_lambda_10_corr_correspondence_training_counter_15000.npz')
treatment_magan = pd.DataFrame(out['stz_to_obob'], index=stz_treat.obs_names, columns=adata_beta_obob.var_names)
treatment_magan_magic_pc = obob_pc_op.transform(treatment_magan)
treatment_magan_magic_pc_norm = treatment_magan_magic_pc / np.std(treatment_magan_magic_pc[:, 0])

## Batch ASW

In [51]:
evaluation_summary = pd.DataFrame(columns=['pre_post_corr'])

In [55]:
adata_post_corr = scanpy.AnnData(treatment_magan)
adata_post_all = scanpy.concat((adata_post_corr, adata_beta_obob), label='batch')
adata_post_all.obsm['X_pc'] = np.vstack((treatment_magan_magic_pc_norm, obob_magic_pc_norm))
adata_post_all.obs['cell_type'] = 'beta cell'

In [56]:
evaluation_summary.loc['Batch ASW'] = scib.me.silhouette_batch(adata_post_all, batch_key='batch', label_key='cell_type', embed='X_pc', )

mean silhouette per group:            silhouette_score
group                      
beta cell          0.642818


## Cosine similarity

In [57]:
treatment_magan.columns = adata_beta_obob.var['names']

In [58]:
treatment_magan = treatment_magan.iloc[:, ~treatment_magan.columns.duplicated()]

In [59]:
intersection = list(set(ref_x.columns).intersection(treatment_magan.columns))
len(intersection)

15553

In [60]:
ref_x = ref_x[intersection].values
post_x = treatment_magan[intersection].values

In [61]:
pre_post_corr = []
for i in tqdm(range(stz_treat.n_obs)):
    pre_post_corr.append(cosine_similarity(ref_x[i].reshape(1,-1), post_x[i].reshape(1,-1))[0][0])

100%|██████████| 1160/1160 [00:00<00:00, 2802.14it/s]


In [62]:
evaluation_summary.loc['Mean cell correlation pre- and post-alignment'] = np.nanmean(pre_post_corr)

In [63]:
stz_treat.obs['pre_post_corr'] = pre_post_corr

In [64]:
evaluation_summary = pd.concat((evaluation_summary, stz_treat.obs[['AT_committment', 'pre_post_corr']].groupby('AT_committment').mean()))

In [65]:
evaluation_summary.to_csv('results/stz_scMMGAN_stats.csv')

# Run 2

# WT

In [None]:
adata_beta_wt = adata_beta[adata_beta.obs['samples'] == 'WT']
wt_pc_op = pd.read_pickle('../analysis_archetypes/results/WT_magic_pc_op.pkl')
wt_magic_pc_norm = pd.read_pickle('../analysis_archetypes/results/WT_magic_pc_norm.pkl')

## Get reference

In [13]:
stz = scanpy.read_h5ad('data/stz_beta_ins1_ins2.h5ad')
stz_ctrl = stz[stz.obs['groups_named_broad_cond'] == 'beta_Ctrl']
magic_op = magic.MAGIC(random_state=42, t=20)
stz_ctrl.layers['X_magic'] = magic_op.fit_transform(stz_ctrl.to_df())
ref_x = pd.DataFrame(stz_ctrl.layers['X_magic'], index=stz_ctrl.obs_names, columns=stz_ctrl.var_names)

Calculating MAGIC...
  Running MAGIC on 5600 cells and 18210 genes.
  Calculating graph and diffusion operator...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculating PCA...
    Calculated PCA in 2.65 seconds.
    Calculating KNN search...
    Calculated KNN search in 3.25 seconds.
    Calculating affinities...
    Calculated affinities in 3.21 seconds.
  Calculated graph and diffusion operator in 9.25 seconds.
  Running MAGIC with `solver='exact'` on 18210-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  with _logger.task("imputation"):


  Calculated imputation in 4.94 seconds.
Calculated MAGIC in 14.38 seconds.


## Get post correction reference

In [15]:
adata_beta_wt = adata_beta_wt[:, adata_beta[:, ~adata_beta.var['names'].duplicated()].var.index]

In [27]:
data_magic_op = magic.MAGIC(random_state=42, t=10, verbose=False)
data_magic = data_magic_op.fit_transform(adata_beta_wt.to_df())

data_pc_op = decomposition.PCA(n_components=20, random_state=42)
data_magic_pc = data_pc_op.fit_transform(data_magic)
data_magic_pc_norm = data_magic_pc / np.std(data_magic_pc[:, 0])

In [28]:
out = np.load('results/output_run_2_stz_to_wt_cycle_1_correspondence_lambda_10_mse_correspondence.npz')
ctrl_magan = pd.DataFrame(out['stz_to_wt'], index=stz_ctrl.obs_names, columns=adata_beta_wt.var_names)
ctrl_magan_magic_pc = data_pc_op.transform(ctrl_magan)
ctrl_magan_magic_pc_norm = ctrl_magan_magic_pc / np.std(ctrl_magan_magic_pc[:, 0])

## Batch ASW

In [29]:
evaluation_summary = pd.DataFrame(columns=['pre_post_corr'])

In [30]:
adata_post_corr = scanpy.AnnData(ctrl_magan)
adata_post_all = scanpy.concat((adata_post_corr, adata_beta_wt), label='batch')
adata_post_all.obsm['X_pc'] = np.vstack((ctrl_magan_magic_pc_norm, wt_magic_pc_norm))
adata_post_all.obs['cell_type'] = 'beta cell'

In [31]:
evaluation_summary.loc['Batch ASW'] = scib.me.silhouette_batch(adata_post_all, batch_key='batch', label_key='cell_type', embed='X_pc', )

mean silhouette per group:            silhouette_score
group                      
beta cell          0.909842


## Cosine similarity

In [32]:
ctrl_magan = ctrl_magan.T.set_index(adata_beta_wt.var['names']).T
ctrl_magan = ctrl_magan.iloc[:, ~ctrl_magan.columns.duplicated()]

In [33]:
intersection = list(set(ref_x.columns).intersection(ctrl_magan.columns))
len(intersection)

15553

In [34]:
ref_x = ref_x[intersection].values
post_x = ctrl_magan[intersection].values

In [35]:
pre_post_corr = []
for i in tqdm(range(stz_ctrl.n_obs)):
    pre_post_corr.append(cosine_similarity(ref_x[i].reshape(1,-1), post_x[i].reshape(1,-1))[0][0])

100%|██████████| 5600/5600 [00:02<00:00, 2522.35it/s]


In [37]:
evaluation_summary.loc['Mean cell correlation pre- and post-alignment'] = np.nanmean(pre_post_corr)

In [38]:
stz_ctrl.obs['pre_post_corr'] = pre_post_corr

In [39]:
evaluation_summary = pd.concat((evaluation_summary, stz_ctrl.obs[['AT_committment', 'pre_post_corr']].groupby('AT_committment').mean()))

In [40]:
evaluation_summary.to_csv('results/ctrl_scMMGAN_stats_run_2.csv')

# OB/OB

In [47]:
adata_beta_obob = adata_beta[adata_beta.obs['samples'] == 'OB/OB']
adata_beta_obob = adata_beta_obob[:, adata_beta[:, ~adata_beta.var['names'].duplicated()].var.index]

In [48]:
data_magic_op = magic.MAGIC(random_state=42, t=10, verbose=False)
data_magic = data_magic_op.fit_transform(adata_beta_obob.to_df())

data_pc_op = decomposition.PCA(n_components=20, random_state=42)
data_magic_pc = data_pc_op.fit_transform(data_magic)
data_magic_pc_norm = data_magic_pc / np.std(data_magic_pc[:, 0])

  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


  Running MAGIC with `solver='exact'` on 16479-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.


  with _logger.task("imputation"):


## Get reference

In [45]:
stz = scanpy.read_h5ad('data/stz_beta_ins1_ins2.h5ad')
stz_treat = stz[stz.obs.treatment == 'Vehicle-STZ']
magic_op = magic.MAGIC(random_state=42, t=20)
stz_treat.layers['X_magic'] = magic_op.fit_transform(stz_treat.to_df())
ref_x = pd.DataFrame(stz_treat.layers['X_magic'], index=stz_treat.obs_names, columns=stz_treat.var_names)

Calculating MAGIC...
  Running MAGIC on 1160 cells and 18210 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 0.87 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.15 seconds.
    Calculating affinities...
    Calculated affinities in 0.15 seconds.
  Calculated graph and diffusion operator in 1.21 seconds.
  Running MAGIC with `solver='exact'` on 18210-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  with _logger.task("imputation"):


  Calculated imputation in 0.21 seconds.
Calculated MAGIC in 1.47 seconds.


## Get post correction reference

In [51]:
out = np.load('results/output_run_2_stz_to_obob_cycle_1_correspondence_lambda_10_corr_correspondence.npz')
treatment_magan = pd.DataFrame(out['stz_to_obob'], index=stz_treat.obs_names, columns=adata_beta_obob.var_names)
treatment_magan_magic_pc = data_pc_op.transform(treatment_magan)
treatment_magan_magic_pc_norm = treatment_magan_magic_pc / np.std(treatment_magan_magic_pc[:, 0])

## Batch ASW

In [52]:
evaluation_summary = pd.DataFrame(columns=['pre_post_corr'])

In [54]:
adata_post_corr = scanpy.AnnData(treatment_magan)
adata_post_all = scanpy.concat((adata_post_corr, adata_beta_obob), label='batch')
adata_post_all.obsm['X_pc'] = np.vstack((treatment_magan_magic_pc_norm, data_magic_pc_norm))
adata_post_all.obs['cell_type'] = 'beta cell'

In [55]:
evaluation_summary.loc['Batch ASW'] = scib.me.silhouette_batch(adata_post_all, batch_key='batch', label_key='cell_type', embed='X_pc', )

mean silhouette per group:            silhouette_score
group                      
beta cell          0.641901


## Cosine similarity

In [56]:
treatment_magan.columns = adata_beta_obob.var['names']

In [57]:
treatment_magan = treatment_magan.iloc[:, ~treatment_magan.columns.duplicated()]

In [58]:
intersection = list(set(ref_x.columns).intersection(treatment_magan.columns))
len(intersection)

15553

In [59]:
ref_x = ref_x[intersection].values
post_x = treatment_magan[intersection].values

In [60]:
pre_post_corr = []
for i in tqdm(range(stz_treat.n_obs)):
    pre_post_corr.append(cosine_similarity(ref_x[i].reshape(1,-1), post_x[i].reshape(1,-1))[0][0])

100%|██████████| 1160/1160 [00:00<00:00, 2380.01it/s]


In [61]:
evaluation_summary.loc['Mean cell correlation pre- and post-alignment'] = np.nanmean(pre_post_corr)

In [62]:
stz_treat.obs['pre_post_corr'] = pre_post_corr

In [63]:
evaluation_summary = pd.concat((evaluation_summary, stz_treat.obs[['AT_committment', 'pre_post_corr']].groupby('AT_committment').mean()))

In [64]:
evaluation_summary.to_csv('results/stz_scMMGAN_stats_run_2.csv')