In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy, scipy
import scipy.stats
from sklearn import metrics

In [2]:
# Load the primate expression matrix ('data') and its gene labels ('genes')
# from the batch-corrected .npz archive.
# NOTE(review): allow_pickle=True unpickles objects stored in the archive, which
# can execute arbitrary code — only load this file from a trusted source.
primate_npz_path = '../data/processed_files/5_primate_parathyroid_cells_all_genes_batch_corrected.npz'
with open(primate_npz_path, 'rb') as f:
    npzfile = np.load(f, allow_pickle=True)
    # Arrays are read lazily from the archive, so pull both out while the file is open.
    primate_genes = npzfile['genes']
    primate_magan_data = pd.DataFrame(npzfile['data'], columns=primate_genes)

# When a gene label occurs more than once, keep only its first column.
is_first_occurrence = ~primate_magan_data.columns.duplicated()
primate_magan_data = primate_magan_data.iloc[:, is_first_occurrence]

In [3]:
# Read the human dataset from its .h5ad file; later cells use its obs_names,
# var['PrimateEnsemblID'], obs['sample'] and to_df().
human_h5ad_path = '../data/processed_files/5_human_parathyroid_cells_all_genes_batch_corrected.h5ad'
df_human = scanpy.read_h5ad(human_h5ad_path)

In [4]:
# Load the 'human_to_primate' array written by the scMMGAN run and label it with
# the human cell names (rows) and the primate gene labels (columns).
human_to_primate_path = '../data/processed_files/6_scMMGAN_human_to_primate.npz'
with open(human_to_primate_path, 'rb') as f:
    npzfile = np.load(f)
    human_magan_data = pd.DataFrame(
        npzfile['human_to_primate'],
        index=df_human.obs_names,
        columns=primate_genes,
    )

# When a gene label occurs more than once, keep only its first column.
unique_gene_mask = ~human_magan_data.columns.duplicated()
human_magan_data = human_magan_data.iloc[:, unique_gene_mask]

In [5]:
# Reference table: the human expression values with columns relabelled by the
# 'PrimateEnsemblID' annotation, so genes can be matched to the primate data by name.
# NOTE(review): unlike the primate frames, duplicate column labels are not removed
# here — confirm 'PrimateEnsemblID' is unique for the genes that end up shared.
human_expression = df_human.to_df().values
primate_ids = df_human.var['PrimateEnsemblID'].tolist()
data_ref = pd.DataFrame(human_expression, columns=primate_ids, index=df_human.obs_names)

# Drop genes whose values sum to zero across all cells.
data_ref = data_ref.loc[:, data_ref.sum(axis=0).ne(0)]
genes = np.array(list(primate_magan_data.columns))

# Genes present in both the primate data and the reference table.
intersection_genes = np.array(list(set(genes).intersection(data_ref.columns)))

# Shift each shared gene so its mean over human cells equals its mean over primate cells.
human_shared = data_ref[intersection_genes]
primate_gene_means = primate_magan_data[intersection_genes].mean(axis=0)
data_ref[intersection_genes] = human_shared - human_shared.mean(axis=0) + primate_gene_means

### scMMGAN Test set metrics (per sample)

In [14]:
# Drop primate genes whose values sum to zero, then recompute the set of genes
# shared with the reference table from the remaining columns.
primate_gene_totals = primate_magan_data.sum(axis=0)
primate_magan_data = primate_magan_data.loc[:, primate_gene_totals.ne(0)]

primate_gene_set = set(primate_magan_data.columns)
intersection_genes = np.array(list(primate_gene_set.intersection(data_ref.columns)))

In [13]:
# Empty results table: one row per test run (0-9) and, for the whole test set
# ('total') and each sample, a cell-wise and a gene-wise Pearson column.
metric_columns = [
    f'{group} {axis} pearson'
    for group in ('total', 'Y7', 'Y9', 'Y11', 'Y13')
    for axis in ('cell', 'gene')
]
correlation_df = pd.DataFrame(index=range(10), columns=metric_columns)

In [None]:
# Evaluate the scMMGAN human->primate mapping on each of the 10 test runs.
# For every run, and for the whole test set plus each sample separately, this
# records the mean Pearson correlation between the reference table (data_ref)
# and the mapped test cells, computed per cell (across genes) and per gene
# (across cells). Only genes in intersection_genes are compared.

# Fail early (before any computation) if the output location cannot be created.
os.makedirs('results/human_parathyroid', exist_ok=True)

# The test-cell assignments do not change between runs, so read them once.
# NOTE(review): allow_pickle=True unpickles stored objects — trusted files only.
with np.load('../data/mapping_test_runs.npz', allow_pickle=True) as test_runs_file:
    test_runs = test_runs_file['test_runs']

for i in range(10):
    test_cells = test_runs[i]
    is_test_cell = data_ref.index.isin(test_cells)
    # Training cells in data_ref order. A set difference would label the rows in
    # arbitrary, hash-dependent order.
    # NOTE(review): assumes the saved training rows follow data_ref order — confirm
    # against the script that wrote the file.
    train_cells = data_ref.index[~is_test_cell]

    # NOTE(review): assumes the saved arrays have one column per column of the
    # filtered primate_magan_data — confirm against the script that wrote the file.
    with np.load(f'../data/processed_files/6_scMMGAN_human_to_primate_test_{i}.npz') as npzfile:
        human_magan_train = pd.DataFrame(npzfile['human_to_primate'], index=train_cells,
                                         columns=primate_magan_data.columns)
        human_magan_test = pd.DataFrame(npzfile['human_to_primate_test'], index=test_cells,
                                        columns=primate_magan_data.columns)

    # 'total' covers every test cell; each sample entry covers that sample's test cells.
    cell_groups = {'total': test_cells}
    for sample in ['Y7', 'Y9', 'Y11', 'Y13']:
        cell_groups[sample] = data_ref.loc[(df_human.obs['sample'] == sample) & is_test_cell].index

    for label, cells in cell_groups.items():
        # Slice both tables once per group; rows are cells, columns are shared genes.
        reference = data_ref.loc[cells, intersection_genes].to_numpy()
        mapped = human_magan_test.loc[cells, intersection_genes].to_numpy()
        n_cells, n_genes = reference.shape

        cell_correlations = [scipy.stats.pearsonr(reference[row], mapped[row]).correlation
                             for row in range(n_cells)]
        # pearsonr needs at least two observations, so a group with fewer than
        # two cells has no defined gene-wise correlation.
        if n_cells >= 2:
            gene_correlations = [scipy.stats.pearsonr(reference[:, col], mapped[:, col]).correlation
                                 for col in range(n_genes)]
        else:
            gene_correlations = []

        # nanmean skips the NaNs pearsonr returns for constant inputs; an empty
        # group is reported as NaN.
        correlation_df.loc[i, f'{label} cell pearson'] = (
            np.nanmean(cell_correlations) if cell_correlations else np.nan)
        correlation_df.loc[i, f'{label} gene pearson'] = (
            np.nanmean(gene_correlations) if gene_correlations else np.nan)

    # Written after every run so partial results survive an interrupted loop.
    correlation_df.to_csv('results/human_parathyroid/scmmgan_correlation_pearsonr.csv')

### CycleGAN Test set metrics (per sample)

In [9]:
# Drop primate genes whose values sum to zero across all cells.
primate_gene_totals = primate_magan_data.sum(axis=0)
primate_magan_data = primate_magan_data.loc[:, primate_gene_totals.ne(0)]

In [10]:
# Genes present in both the filtered primate data and the reference table.
primate_gene_set = set(primate_magan_data.columns)
intersection_genes = np.array(list(primate_gene_set.intersection(data_ref.columns)))

In [11]:
# Fresh, empty results table for the CycleGAN runs: one row per test run (0-9)
# and, for the whole test set ('total') and each sample, a cell-wise and a
# gene-wise Pearson column.
metric_columns = [
    f'{group} {axis} pearson'
    for group in ('total', 'Y7', 'Y9', 'Y11', 'Y13')
    for axis in ('cell', 'gene')
]
correlation_df = pd.DataFrame(index=range(10), columns=metric_columns)

In [None]:
# Evaluate the CycleGAN human->primate mapping on each of the 10 test runs.
# For every run, and for the whole test set plus each sample separately, this
# records the mean Pearson correlation between the reference table (data_ref)
# and the mapped test cells, computed per cell (across genes) and per gene
# (across cells). Only genes in intersection_genes are compared.

# Fail early (before any computation) if the output location cannot be created.
os.makedirs('results/human_parathyroid', exist_ok=True)

# The test-cell assignments do not change between runs, so read them once.
# NOTE(review): allow_pickle=True unpickles stored objects — trusted files only.
with np.load('../data/mapping_test_runs.npz', allow_pickle=True) as test_runs_file:
    test_runs = test_runs_file['test_runs']

for i in range(10):
    test_cells = test_runs[i]
    is_test_cell = data_ref.index.isin(test_cells)
    # Training cells in data_ref order. A set difference would label the rows in
    # arbitrary, hash-dependent order.
    # NOTE(review): assumes the saved training rows follow data_ref order — confirm
    # against the script that wrote the file.
    train_cells = data_ref.index[~is_test_cell]

    # NOTE(review): assumes the saved arrays have one column per column of the
    # filtered primate_magan_data — confirm against the script that wrote the file.
    with np.load(f'../data/processed_files/6_cyclegan_human_to_primate_test_{i}.npz') as npzfile:
        human_magan_train = pd.DataFrame(npzfile['human_to_primate'], index=train_cells,
                                         columns=primate_magan_data.columns)
        human_magan_test = pd.DataFrame(npzfile['human_to_primate_test'], index=test_cells,
                                        columns=primate_magan_data.columns)

    # 'total' covers every test cell; each sample entry covers that sample's test cells.
    cell_groups = {'total': test_cells}
    for sample in ['Y7', 'Y9', 'Y11', 'Y13']:
        cell_groups[sample] = data_ref.loc[(df_human.obs['sample'] == sample) & is_test_cell].index

    for label, cells in cell_groups.items():
        # Slice both tables once per group; rows are cells, columns are shared genes.
        reference = data_ref.loc[cells, intersection_genes].to_numpy()
        mapped = human_magan_test.loc[cells, intersection_genes].to_numpy()
        n_cells, n_genes = reference.shape

        cell_correlations = [scipy.stats.pearsonr(reference[row], mapped[row]).correlation
                             for row in range(n_cells)]
        # pearsonr needs at least two observations, so a group with fewer than
        # two cells has no defined gene-wise correlation.
        if n_cells >= 2:
            gene_correlations = [scipy.stats.pearsonr(reference[:, col], mapped[:, col]).correlation
                                 for col in range(n_genes)]
        else:
            gene_correlations = []

        # nanmean skips the NaNs pearsonr returns for constant inputs; an empty
        # group is reported as NaN.
        correlation_df.loc[i, f'{label} cell pearson'] = (
            np.nanmean(cell_correlations) if cell_correlations else np.nan)
        correlation_df.loc[i, f'{label} gene pearson'] = (
            np.nanmean(gene_correlations) if gene_correlations else np.nan)

    # Written after every run so partial results survive an interrupted loop.
    correlation_df.to_csv('results/human_parathyroid/cyclegan_correlation_pearsonr.csv')