In [None]:
import os
import random

import numpy as np
import pandas as pd
import scanpy as sc
import torch
import torch.nn as nn
import torch_geometric.data as data
import torch_geometric.nn
from torch_geometric.utils.convert import to_networkx

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``.

    ``x`` may be a scalar or a NumPy array; arrays are handled element-wise
    by ``np.exp``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

from torch_geometric.nn import TransformerConv

In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic, non-benchmark mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # The variable is spelled PYTHONHASHSEED (the previous key was missing an
    # "S" and therefore had no effect). Python reads it at interpreter start,
    # so this only influences processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and force deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
# Fix all random seeds to 0 before any data is loaded or any model is trained.
set_seed(0)

# Here we use the heart data as an example

In [None]:
# for specific encoder/decoder
# tissue_list = {
#                "heart":[233, 676, 783, 947,266, 223, 233, 978, 928, 852, 839, 733]}

# Sample identifiers to load for each tissue; the key is the file-name prefix
# used inside ./heart_atlas.
tissue_list = {
    "scrna_heart": [
        'D4', 'H2', 'H3', 'D6', 'D2', 'H7', 'D11', 'D3',
        'D1', 'D5', 'H4', 'D7', 'H6', 'H5', 'G19',
    ],
}

# construct graph batch
# based on simulation results
graph_list = []  # one torch_geometric Data object per sample
cor_list = []    # the matrix read from each sample's "_pvalue" csv
label_list = []  # tissue key of each sample, aligned with graph_list

for tissue, sample_ids in tissue_list.items():
    for i in sample_ids:
        print(i)
        pathway_count = f"./heart_atlas/{tissue}_{i}_rna_expression.csv"
        pathway_matrix = f"./heart_atlas/{tissue}_{i}_pvalue.csv"

        pd_adata_new = pd.read_csv(pathway_count, index_col=0)
        correlation = pd.read_csv(pathway_matrix, index_col=0)
        cor_list.append(correlation)

        print(correlation.shape)
        print(pd_adata_new.shape)
        adata = sc.AnnData(pd_adata_new)

        # Every non-zero entry of the matrix is an edge (row index -> column
        # index). The indices stay integers all the way: routing them through
        # a float32 tensor is only exact up to 2**24.
        edges_new = np.vstack(np.nonzero(correlation.values))
        graph = data.Data(
            x=torch.FloatTensor(adata.X.copy()),
            edge_index=torch.from_numpy(edges_new).long(),
        )

        # Keep the row labels of the expression table and a
        # "<tissue>__<sample>" tag on the graph for later annotation.
        graph.gene_list = pd_adata_new.index
        graph.show_index = tissue + "__" + str(i)

        graph_list.append(graph)
        label_list.append(tissue)

# PCA

In [None]:
# Display the graphs built in the previous cell.
graph_list

In [None]:
# Per-graph accumulators filled by the PCA loop below: embeddings, gene
# labels and tissue/sample labels. Note that tissue_list is rebound from the
# sample dictionary to a plain list here.
emb_list, gene_list, tissue_list = [], [], []

In [None]:
# tissue_list was reset to an empty list in the previous cell, so this shows 0
# when the notebook is run top to bottom.
len(tissue_list)

In [None]:
# Scale each graph's node-feature matrix and reduce it to 32 principal
# components, keeping the gene labels and a per-node tissue/sample tag.
for i, graph in enumerate(graph_list):
    adata = sc.AnnData(graph.x.cpu().numpy())
    sc.pp.scale(adata)
    sc.tl.pca(adata, 32)

    emb_list.append(adata.obsm['X_pca'])

    gene_list.append(graph.gene_list)
    # One copy of the graph's "<tissue>__<sample>" tag per node.
    tissue_list.append([graph.show_index] * len(graph.x))

In [None]:
# Stack the per-graph PCA embeddings row-wise into a single AnnData object.
adata = sc.AnnData(np.concatenate(emb_list, axis=0))

In [None]:
# Show a summary of the AnnData object holding the stacked PCA embeddings.
adata

In [None]:
# Annotate every row with its gene label and its "<tissue>__<sample>" tag,
# flattening the per-graph lists in the same order the embeddings were stacked.
adata.obs['gene'] = np.concatenate(gene_list)
adata.obs['tissue'] = np.concatenate(tissue_list)

In [None]:
# Inspect the per-row "<tissue>__<sample>" labels.
adata.obs['tissue']

In [None]:
# Build the neighbor graph directly on adata.X (the PCA embeddings), then
# compute the UMAP layout from it.
sc.pp.neighbors(adata, use_rep='X')
sc.tl.umap(adata)

In [None]:
# UMAP colored by the "<tissue>__<sample>" label of each row.
sc.pl.umap(adata, color='tissue')

In [None]:
# Cluster the rows with Leiden on the neighbor graph computed above.
sc.tl.leiden(adata)

In [None]:
# UMAP colored by Leiden cluster.
sc.pl.umap(adata, color='leiden')

In [None]:
# Keep only the tissue part (text before the first "__") of each label.
adata.obs['tissue_new'] = [label.partition("__")[0] for label in adata.obs['tissue']]

In [None]:
# UMAP colored by tissue only (sample suffix stripped).
sc.pl.umap(adata, color='tissue_new')

In [None]:
# Save the annotated PCA embeddings.
# NOTE(review): presumably the "heart_global/" directory must already exist — confirm.
adata.write_h5ad("heart_global/heart_umi_PCA.h5ad")

# Gene2vec

In [None]:
import numpy as np
from torch_geometric.utils.convert import to_networkx

import numpy as np

import scanpy as sc
import numpy as np


def sigmoid(x):
    """Logistic function of ``x``: 1 / (1 + exp(-x)).

    Re-declares the helper already defined in the notebook's first cell.
    """
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

In [None]:
import gensim

In [None]:
# ---- gene2vec (Word2Vec) training parameters ----

# Embedding size and context window (maximum distance between a gene and the
# predicted gene within one gene list).
dimension = 32
window_size = 1

# Training algorithm: 1 = skip-gram, 0 = CBOW.
sg = 1

# Number of training iterations and of worker threads.
max_iter = 10
num_workers = 32

# Text-output flag; not referenced by the cells shown in this notebook.
txtOutput = True

In [None]:
# model = gensim.models.Word2Vec(gene_pairs, vector_size=dimension, window=window_size, min_count=1, workers=num_workers,sg=sg, )
# model.train(gene_pairs,total_examples=model.corpus_count,epochs=max_iter)

In [None]:
# vector = model.wv['ENSG00000158747.15'] 

In [None]:
# vector_list = np.zeros((1000,32))
# gene_list = []
# for num,i in enumerate(edge_list.index):
#     vector_list[num] = model.wv[i] 
#     gene_list.append(i)

In [None]:
# gene_list

In [None]:
def generate_list(model, edge_list):
    """Collect the embedding of every gene listed in ``edge_list.index``.

    The output is sized from the data instead of a fixed (1000, 32) buffer,
    so it has exactly one row per gene and works for any embedding width.

    Args:
        model: Object exposing ``model.wv[gene]`` vector lookups (a trained
            gensim Word2Vec model in this notebook).
        edge_list: Object whose ``index`` holds the gene identifiers (a
            pandas DataFrame in this notebook).

    Returns:
        Tuple ``(vector_list, gene_list)``: a float64 array of shape
        ``(n_genes, embedding_dim)`` and the gene identifiers in row order.

    Raises:
        KeyError: If a gene in ``edge_list.index`` is unknown to ``model.wv``.
    """
    gene_list = list(edge_list.index)
    vectors = [np.asarray(model.wv[gene], dtype=float) for gene in gene_list]
    if not vectors:
        # No genes: return an empty matrix with the model's width if known.
        return np.zeros((0, getattr(model.wv, "vector_size", 0))), gene_list
    return np.vstack(vectors), gene_list

In [None]:
# Train one gene2vec (Word2Vec) model per sample and collect its embeddings.
vec_list = []         # per-sample (n_genes, dimension) embedding matrices
gene_list_final = []  # per-sample gene identifiers, row-aligned with vec_list

# cor_list holds one matrix per entry of graph_list, in the same order.
for edge_list in cor_list:
    # Each non-zero entry (row gene, column gene) becomes a two-token sentence.
    row_idx, col_idx = np.nonzero(edge_list.values)
    gene_pairs = [
        [row_gene, col_gene]
        for row_gene, col_gene in zip(edge_list.index[row_idx], edge_list.columns[col_idx])
    ]

    # Passing the corpus to the constructor builds the vocabulary (and, per
    # the gensim documentation, already trains once); train() then runs
    # max_iter further epochs.
    model = gensim.models.Word2Vec(gene_pairs, vector_size=dimension, window=window_size,
                                   min_count=1, workers=num_workers, sg=sg)
    model.train(gene_pairs, total_examples=model.corpus_count, epochs=max_iter)

    print('finish gene2vec training')

    # Size the matrix from the data rather than a fixed (1000, 32) buffer so
    # rows always line up with gene_list, and use dedicated loop names so the
    # outer loop variables are no longer shadowed.
    gene_list = list(edge_list.index)
    vector_list = np.zeros((len(gene_list), dimension))
    for row, gene in enumerate(gene_list):
        vector_list[row] = model.wv[gene]

    vec_list.append(vector_list)
    gene_list_final.append(gene_list)
    

In [None]:
# Rebuild the per-graph label lists: one "<tissue>__<sample>" tag per node.
tissue_list = []
for graph in graph_list:
    label_list = [graph.show_index] * len(graph.x)
    tissue_list.append(label_list)

In [None]:
# Flatten the per-graph label lists. np.concatenate accepts the list of
# sequences directly; wrapping it in np.array first fails on recent NumPy
# when the graphs have different numbers of nodes.
np.concatenate(tissue_list)

In [None]:
# Stack the per-sample embedding matrices row-wise. Concatenating the list
# directly also works when the samples have different numbers of genes.
adata = sc.AnnData(np.concatenate(vec_list))

In [None]:
# Per-row "<tissue>__<sample>" label; concatenate the list directly so label
# lists of different lengths are handled.
adata.obs['tissue'] = np.concatenate(tissue_list)

In [None]:
# Per-row gene identifier; concatenate the list directly so gene lists of
# different lengths are handled.
adata.obs['gene'] = np.concatenate(gene_list_final)

In [None]:
# Build the neighbor graph directly on adata.X (the gene2vec embeddings),
# then compute the UMAP layout from it.
sc.pp.neighbors(adata, use_rep='X')
sc.tl.umap(adata)

In [None]:
# Cluster the rows with Leiden on the neighbor graph computed above.
sc.tl.leiden(adata)

In [None]:
# UMAP colored by Leiden cluster.
sc.pl.umap(adata, color='leiden')

In [None]:
# UMAP colored by the "<tissue>__<sample>" label of each row.
sc.pl.umap(adata, color='tissue')

In [None]:
# Save the annotated gene2vec embeddings.
# NOTE(review): presumably the 'heart_global/' directory must already exist — confirm.
adata.write_h5ad('heart_global/heart_umi_gene2vec.h5ad')

# scBERT

In [None]:
# Please see the scBERT code:
# https://github.com/TencentAILabHealthcare/scBERT

# GIANT

In [None]:
# Please see the GIANT code:
# https://github.com/chenhcs/GIANT

# GAE/VGAE/MAE/WSMAE

In [None]:
# Please see the separate "{method} benchmark.py" file for each of these methods

# SUGRL

In [None]:
# Please see the SUGRL code:
# https://github.com/YujieMo/SUGRL

# GPS

In [None]:
# Please see the GPS (GPSConv) implementation:
# https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GPSConv.html#torch_geometric.nn.conv.GPSConv

# Graphormer

In [None]:
# Please see the Graphormer code:
# https://github.com/microsoft/Graphormer