In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy.stats import spearmanr
import scanpy.external as sce
from itertools import product

In [2]:
path_cwd=os.getcwd()
path_train_data=path_cwd + '/ML4G_Project_2_Data/train_data'
path_test_data=path_cwd + '/ML4G_Project_2_Data/test_data'
#train_data

bulk_train=pd.read_csv(path_train_data +'/pancreas_bulk_train.csv',index_col=0)
metadata_train=pd.read_csv(path_train_data+ '/pancreas_sc_metadata_train.csv')
sc_train=pd.read_csv(path_train_data +'/pancreas_sc_train.csv', index_col=0)

#test_data
metadata_test=pd.read_csv(path_test_data+ '/pancreas_sc_metadata_test_wocelltype.csv')
sc_test=pd.read_csv(path_test_data +'/pancreas_sc_test.csv', index_col=0)

cells_1 = [column for column in sc_train.columns if 'patient1' in column ]
cells_2 = [column for column in sc_train.columns if 'patient2' in column ]
cells_3 = [column for column in sc_train.columns if 'patient3' in column ]
cells_4 = [column for column in sc_train.columns if 'patient4' in column ]

# we transpose the dataframe since we want a cells x genes dataframe for applying MAGIC
sc1_train=sc_train[cells_1].T
sc2_train=sc_train[cells_2].T
sc3_train=sc_train[cells_3].T
sc4_train=sc_train[cells_4].T

# we aggregate the dataframes in a list
patients=[sc1_train,sc2_train,sc3_train,sc4_train]

# we extract genes that are not constant across cells
genes1=list(sc1_train.loc[:, (sc1_train != sc1_train.iloc[0]).any()].columns)
genes2=list(sc2_train.loc[:, (sc2_train != sc2_train.iloc[0]).any()].columns)
genes3=list(sc3_train.loc[:, (sc3_train != sc3_train.iloc[0]).any()].columns)
genes4=list(sc4_train.loc[:, (sc4_train != sc4_train.iloc[0]).any()].columns)

genes=[genes1,genes2,genes3,genes4]

In [8]:
# Pre processing: Normalize per total reads per cell and transform with square root

# Helper function to pre process data
def pp(data):
    data=ad.AnnData(data) # needed to use following functions
    #sc.pp.filter_cells(data,min_counts=1)
    sc.pp.normalize_total(data)
    sc.pp.sqrt(data)
    data=pd.DataFrame(data.X, columns = data.var_names, index=data.obs_names)
    return data

# we preprocess the data for each patient
patients_pp=[]
for i,data in enumerate(patients):
    pp_data=pp(data)
    patients_pp += [pp_data]

### Using the whole train dataset

In [9]:
# concatenating the datasets
data_complete=pd.concat(patients_pp,axis=0)
genes=list(data_complete.loc[:, (data_complete != data_complete.iloc[0]).any()].columns)
lens=[len(x) for x in patients_pp]
indexes=[np.sum(lens[:i+1]) for i in range(4)]
results=[]

# hyperparameter search space for magic
k_set ,t_set, n_pca_set= [60,80,90,100], ['auto'] ,[175,180,185]
best_params=0
best_score=0

# imputation between single cell data and bulk data (baseline)
baseline=np.mean([spearmanr(patients_pp[i].mean(axis=0),bulk_train['patient' + str(i+1)]).statistic for i in range(4)])

# MAGIC imputation with hyperparameter search
for k,t,n_pca in product(k_set,t_set,n_pca_set):
    imputed_global=data_complete.copy()
    imputed_local=sce.pp.magic(ad.AnnData(data_complete.loc[:,genes]),knn=k,t=t,n_pca=n_pca,n_jobs=8,copy=True,verbose=0,solver='approximate',random_state=7)
    imputed_global.loc[:,genes]=imputed_local.X
    result=0
    for i,index in enumerate(indexes):
        if i==0: sc_imputed= imputed_global.iloc[:index,:]
        else: sc_imputed= imputed_global.iloc[indexes[i-1]:index,:]
        bulkified=sc_imputed.mean(axis=0)
        bulk=bulk_train['patient' + str(i+1)]
        result+=spearmanr(bulkified,bulk).statistic/4
    if result >= best_score:
        best_params=[k,t,n_pca]
        best_score=result
    results+=[result]
    print(f'parameters: {k,t,n_pca}')
    print(f'Correlation After imputation: {result}\n--------')


print(f'Correlation before imputation: {baseline}')
print(f'BEST CORRELATION AFTER IMPUTATION: {np.max(results)}')
print(f'best parameters: {best_params}')


parameters: (60, 'auto', 175)
Correlation After imputation: 0.9470659901458889
--------
parameters: (60, 'auto', 180)
Correlation After imputation: 0.9470291760147787
--------
parameters: (60, 'auto', 185)
Correlation After imputation: 0.9470344100812496
--------
parameters: (80, 'auto', 175)
Correlation After imputation: 0.9473971744862868
--------
parameters: (80, 'auto', 180)
Correlation After imputation: 0.9473676454934784
--------
parameters: (80, 'auto', 185)
Correlation After imputation: 0.9473711161147803
--------
parameters: (90, 'auto', 175)
Correlation After imputation: 0.9474921757883071
--------
parameters: (90, 'auto', 180)
Correlation After imputation: 0.9474628878062937
--------
parameters: (90, 'auto', 185)
Correlation After imputation: 0.9474705820892478
--------
parameters: (100, 'auto', 175)
Correlation After imputation: 0.9475472372512046
--------
parameters: (100, 'auto', 180)
Correlation After imputation: 0.947519650337976
--------
parameters: (100, 'auto', 185)


### Differentiating by health status

In [11]:
# we apply the same procedure treating healthy and sick patients separately


data_complete_healthy=pd.concat(patients_pp[1:3],axis=0)
data_complete_sick=pd.concat([patients_pp[0],patients_pp[3]],axis=0)
genes_healthy=list(data_complete_healthy.loc[:, (data_complete_healthy != data_complete_healthy.iloc[0]).any()].columns)
genes_sick=list(data_complete_sick.loc[:, (data_complete_sick != data_complete_sick.iloc[0]).any()].columns)

index_healthy=len(patients_pp[1])
index_sick=len(patients_pp[0])


imputed_global=data_complete_healthy.copy()
imputed_local=sce.pp.magic(ad.AnnData(data_complete_healthy.loc[:,genes_healthy]),knn=80,t='auto',n_pca=175,n_jobs=8,copy=True,verbose=0,solver='approximate')
imputed_global.loc[:,genes_healthy]=imputed_local.X
result=0
sc_imputed1= imputed_global.iloc[index_healthy:,:]
sc_imputed2= imputed_global.iloc[:index_healthy,:]
bulkified1=sc_imputed1.mean(axis=0)
bulkified2=sc_imputed2.mean(axis=0)
bulk1=bulk_train['patient' + str(2)]
bulk2=bulk_train['patient' + str(3)]

result=(spearmanr(bulkified1,bulk1).statistic + spearmanr(bulkified2,bulk2).statistic)/4
result


imputed_global=data_complete_sick.copy()
imputed_local=sce.pp.magic(ad.AnnData(data_complete_sick.loc[:,genes_sick]),knn=80,t='auto',n_pca=175,n_jobs=8,copy=True,verbose=0,solver='approximate')
imputed_global.loc[:,genes_sick]=imputed_local.X
sc_imputed1= imputed_global.iloc[index_sick:,:]
sc_imputed2= imputed_global.iloc[:index_sick,:]
bulkified1=sc_imputed1.mean(axis=0)
bulkified2=sc_imputed2.mean(axis=0)
bulk1=bulk_train['patient' + str(1)]
bulk2=bulk_train['patient' + str(4)]

result+=(spearmanr(bulkified1,bulk1).statistic + spearmanr(bulkified2,bulk2).statistic)/4
result


0.9440258830507289

### Using also unlabelled data 

In [13]:
# we include also unlabelled data in the training set (i.e. patients 5,6,7)

cells_5=[column for column in sc_test.columns if 'patient5' in column ]
cells_6=[column for column in sc_test.columns if 'patient6' in column ]
cells_7=[column for column in sc_test.columns if 'patient7' in column ]


# we transpose the dataframe since we want a cells x genes dataframe for applying MAGIC
sc5_train=sc_test[cells_5].T
sc6_train=sc_test[cells_6].T
sc7_train=sc_test[cells_7].T


patients=[sc1_train,sc2_train,sc3_train,sc4_train,sc5_train,sc6_train,sc7_train]
patients_pp=[]
for i,data in enumerate(patients):
    pp_data=pp(data)
    patients_pp += [pp_data]
data_complete=pd.concat(patients_pp,axis=0)
genes=list(data_complete.loc[:, (data_complete != data_complete.iloc[0]).any()].columns)


lens=[len(x) for x in patients_pp]
indexes=[np.sum(lens[:i+1]) for i in range(4)]
results=[]
k_set ,t_set, n_pca_set= [200, 250], ['auto'] ,[250, 300]
best_params=0
best_score=0

baseline=np.mean([spearmanr(patients_pp[i].mean(axis=0),bulk_train['patient' + str(i+1)]).statistic for i in range(4)])


for k,t,n_pca in product(k_set,t_set,n_pca_set):
    imputed_global=data_complete.copy()
    imputed_local=sce.pp.magic(ad.AnnData(data_complete.loc[:,genes]),knn=k,t=t,n_pca=n_pca,n_jobs=8,copy=True,verbose=0,solver='approximate',random_state=7)
    imputed_global.loc[:,genes]=imputed_local.X
    result=0
    for i,index in enumerate(indexes):
        if i==0: sc_imputed= imputed_global.iloc[:index,:]
        else: sc_imputed= imputed_global.iloc[indexes[i-1]:index,:]
        bulkified=sc_imputed.mean(axis=0)
        bulk=bulk_train['patient' + str(i+1)]
        result+=spearmanr(bulkified,bulk).statistic/4
    if result >= best_score:
        best_params=[k,t,n_pca]
        best_score=result
    results+=[result]
    print(f'parameters: {k,t,n_pca}')
    print(f'Correlation After imputation: {result}\n--------')


print(f'Correlation before imputation: {baseline}')
print(f'BEST CORRELATION AFTER IMPUTATION: {np.max(results)}')
print(f'best parameters: {best_params}')

parameters: (200, 'auto', 250)
Correlation After imputation: 0.9524720550899409
--------
parameters: (200, 'auto', 300)
Correlation After imputation: 0.9524691325148815
--------
parameters: (250, 'auto', 250)
Correlation After imputation: 0.9525459803554612
--------
parameters: (250, 'auto', 300)
Correlation After imputation: 0.9525375689279423
--------
Correlation before imputation: 0.9358073816280966
BEST CORRELATION AFTER IMPUTATION: 0.9525459803554612
best parameters: [250, 'auto', 250]
