In [1]:
# important, you need to install scvelo

import os
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
import matplotlib.pyplot as plt
import magic
import anndata as ad
from scipy.stats import spearmanr
from itertools import product

In [2]:
# Locate the train/test data folders relative to the current working directory
# (the directory the notebook kernel was started from).
path_cwd = os.getcwd()
path_train_data = f'{path_cwd}/ML4G_Project_2_Data/train_data'
path_test_data = f'{path_cwd}/ML4G_Project_2_Data/test_data'

In [3]:
# --- Training data ---
# The bulk and single-cell tables use their first CSV column as the row index;
# the metadata table is read with the default integer index.
bulk_train = pd.read_csv(f'{path_train_data}/pancreas_bulk_train.csv', index_col=0)
metadata_train = pd.read_csv(f'{path_train_data}/pancreas_sc_metadata_train.csv')
sc_train = pd.read_csv(f'{path_train_data}/pancreas_sc_train.csv', index_col=0)

# --- Test data ---
# Same layout as the training files (the metadata file name suggests the
# cell-type column is withheld -- confirm against the data description).
metadata_test = pd.read_csv(f'{path_test_data}/pancreas_sc_metadata_test_wocelltype.csv')
sc_test = pd.read_csv(f'{path_test_data}/pancreas_sc_test.csv', index_col=0)

In [4]:
# For each of the four patients, gather the sc_train column labels whose name
# contains that patient's tag.
cells_1 = [name for name in sc_train.columns if 'patient1' in name]
cells_2 = [name for name in sc_train.columns if 'patient2' in name]
cells_3 = [name for name in sc_train.columns if 'patient3' in name]
cells_4 = [name for name in sc_train.columns if 'patient4' in name]

# MAGIC is applied to a cells x genes table, so pick each patient's columns
# and transpose the resulting frame.
sc1_train = sc_train.loc[:, cells_1].transpose()
sc2_train = sc_train.loc[:, cells_2].transpose()
sc3_train = sc_train.loc[:, cells_3].transpose()
sc4_train = sc_train.loc[:, cells_4].transpose()
sc1_train.head(3)

Unnamed: 0,SGIP1,AZIN2,CLIC4,AGBL4,NECAP2,SLC45A1,TGFBR3,DBT,RFWD2,C1orf21,...,LOC389831,MGC70870,KIR2DS5,KIR2DL5A,KIR3DS1,KIR2DL5B,KIR2DS2,KIR2DS1,KIR2DL2,KIR2DS3
patient1_A10,0.0,0.0,127.0,0.0,140.0,0.0,0.0,1.0,103.0,267.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
patient1_A12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
patient1_A13,0.0,110.0,0.0,21.0,0.0,0.0,0.0,56.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


What we want to avoid is one experiment being more sensitive than another (when comparing two patients). This effect is cancelled by normalizing by total counts for each patient, as we do here, so we do not need any further normalization (which would flatten variability that is present for biological reasons).

In [5]:
# Pre-processing: normalize by total reads per cell, then log-transform.
def pp(data):
    """Return a normalized, log-transformed copy of ``data`` as a DataFrame.

    ``data`` is wrapped in an AnnData object (the scvelo preprocessing calls
    operate on AnnData), then ``scv.pp.normalize_per_cell`` and
    ``scv.pp.log1p`` are applied in that order. Their return values are
    ignored, i.e. both calls are relied on to modify the AnnData in place.

    Parameters
    ----------
    data : table accepted by ``ad.AnnData``
        In this notebook, a cells x genes DataFrame for a single patient.

    Returns
    -------
    pandas.DataFrame
        The processed matrix, indexed by the AnnData observation names with
        the variable names as columns.
    """
    adata = ad.AnnData(data)  # AnnData wrapper needed by the scvelo functions
    scv.pp.normalize_per_cell(adata)
    scv.pp.log1p(adata)
    return pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names)

# Raw per-patient tables (cells x genes), in patient order 1..4.
patients = [sc1_train, sc2_train, sc3_train, sc4_train]

# Pre-process every patient independently; patients_pp[i] belongs to patient i+1.
patients_pp = [pp(patient) for patient in patients]

# Bind the processed tables to explicit names instead of injecting them into
# the namespace with vars().__setitem__(...): the names are now visible to
# readers and linters, and a change in the number of patients fails loudly
# here instead of silently leaving stale variables behind.
data1, data2, data3, data4 = patients_pp

data1.head(5)

Normalized count data: X.
Normalized count data: X.
Normalized count data: X.
Normalized count data: X.


Unnamed: 0,SGIP1,AZIN2,CLIC4,AGBL4,NECAP2,SLC45A1,TGFBR3,DBT,RFWD2,C1orf21,...,LOC389831,MGC70870,KIR2DS5,KIR2DL5A,KIR3DS1,KIR2DL5B,KIR2DS2,KIR2DS1,KIR2DL2,KIR2DS3
patient1_A10,0.0,0.0,4.584115,0.0,4.680622,0.0,0.0,0.567086,4.377034,5.321808,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
patient1_A12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.694045,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
patient1_A13,0.0,4.803261,0.0,3.181477,0.0,0.0,0.0,4.136011,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
patient1_A15,2.512689,0.0,3.997298,0.0,5.627522,0.0,0.0,5.113262,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
patient1_A17,0.0,1.220985,0.0,0.0,5.342219,0.0,0.0,0.0,2.100662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Grid search over MAGIC hyper-parameters (knn, t, n_pca).
# For each combination, every patient's pre-processed matrix is imputed, the
# imputed values are averaged per column, and the Spearman correlation with
# that patient's bulk profile is computed. The score of a combination is the
# mean correlation across patients.
k_set, t_set, n_pca_set = [10, 30, 60], ['auto', 3, 5, 10], [100, 500, 1000]

# Materialise the grid so that results[j] can be traced back to param_grid[j].
param_grid = list(product(k_set, t_set, n_pca_set))
n_patients = len(patients_pp)  # replaces the hard-coded divisor 4

results = []
for k, t, n_pca in param_grid:
    result = 0
    for i, patient in enumerate(patients_pp):
        magic_op = magic.MAGIC(verbose=0, knn=k, t=t, n_pca=n_pca)
        imputed = magic_op.fit_transform(patient)
        pred = imputed.mean(axis=0)
        bulk = bulk_train['patient' + str(i + 1)]
        # NOTE(review): spearmanr pairs the two inputs by position, not by
        # label -- this assumes the columns of `patient` are in the same
        # order as the index of bulk_train; confirm.
        result += spearmanr(pred, bulk).statistic / n_patients
    results.append(result)

# Hyper-parameters (knn, t, n_pca) that achieved the highest mean correlation.
best_params = param_grid[results.index(max(results))]

In [9]:
# Reference score without imputation: Spearman correlation between each
# patient's per-column mean of the pre-processed data and that patient's bulk
# profile, averaged over the four patients.
rho_per_patient = [
    spearmanr(patients_pp[idx].mean(axis=0), bulk_train['patient' + str(idx + 1)]).statistic
    for idx in range(4)
]
baseline = np.mean(rho_per_patient)

# If the best grid-search score is not above the baseline, MAGIC imputation
# did not improve agreement with the bulk data for any tested setting.
print(f'Correlation before imputation: {baseline}')
print(f'Best Correlation: {max(results)}')

Correlation before imputation: 0.933968744579174
Best Correlation: 0.9330516106236137
