In [1]:
import pandas as pd
import scanpy as sc
import numpy as np


In [2]:
"""
@author: wen zhang
This function integrates two single-cell datasets, spatial and scRNA-seq, 
and predictes the expression of the spatially unmeasured genes from the scRNA-seq data.

Parameters
-------
RNA_file : str
    scRNA-seq data count file with Tab-delimited (cells X genes).
Spatial_file : str
    spatial count data file with Tab-delimited, please note that the file has no index.
location_file : str
    spatial spot coordinate file name with Tab-delimited, please note that the file has no index.
device : str
    Option,  ['CPU','GPU'], defaults to 'CPU'
train_gene : list
    genes for integrations, you can support more than one train list.
predict_gene : list
    genes for prediction, you can support more than one test list.
outdir : str
    result file stored direction    
"""

DataDir = 'DataUpload/Dataset4/'
outdir = 'FigureData/Figure2/Dataset4/'
RNA_file = DataDir + 'scRNA_count.txt'
Spatial_file = DataDir + 'Insitu_count.txt'
location_file = DataDir + 'Locations.txt'

RNA_data = pd.read_table(RNA_file, header=0, index_col = 0)
Spatial_data = pd.read_table(Spatial_file, sep = '\t',header = 0)
RNA_data_adata = sc.read(RNA_file, sep = '\t', first_column_names = True).T
Spatial_data_adata = sc.read(Spatial_file, sep = '\t')
locations = np.loadtxt(location_file, skiprows=1)

# train_gene = np.load(DataDir + 'train_list.npy', allow_pickle = True).tolist()
# predict_gene = np.load(DataDir + 'test_list.npy', allow_pickle = True).tolist()

# # device = 'GPU'

# if not os.path.exists(outdir):
#     os.mkdir(outdir)




In [3]:
RNA_data

Unnamed: 0,F1S4_160108_001_A01,F1S4_160108_001_B01,F1S4_160108_001_C01,F1S4_160108_001_D01,F1S4_160108_001_E01,F1S4_160108_001_F01,F1S4_160108_001_G01,F1S4_160108_001_H01,F1S4_160108_002_A01,F1S4_160108_002_B01,...,FYS4_171004_103_F01,FYS4_171004_103_G01,FYS4_171004_103_H01,FYS4_171004_104_A01,FYS4_171004_104_B01,FYS4_171004_104_C01,FYS4_171004_104_D01,FYS4_171004_104_F01,FYS4_171004_104_G01,FYS4_171004_104_H01
X0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X0610007P14Rik,79,121,89,115,390,61,72,161,118,121,...,82,263,45,88,81,187,99,123,112,37
X0610009B22Rik,140,175,68,0,177,110,149,60,61,114,...,66,136,82,74,104,51,160,245,179,91
X0610009E02Rik,0,1,0,0,0,0,0,0,0,26,...,3,0,0,0,0,0,0,0,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zzef1,36,1,104,0,0,0,17,6,56,0,...,0,0,0,0,11,0,16,0,54,0
Zzz3,0,29,21,30,9,88,8,14,0,113,...,30,0,0,26,0,70,0,93,0,70
a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
l7Rn6,190,178,153,110,335,178,131,128,41,258,...,121,287,113,94,147,133,271,75,282,0


In [4]:
Spatial_data

Unnamed: 0,Usp40,Pank2,Nrip2,Tmem245,Col6a1,Slc7a4,Adam19,Abcf1,Pip4k2c,Gm1966,...,Lynx1,Trp53i11,Ctgf,Mog,Slc17a7,Cldn11,Gsn,Cnp,Cplx1,Mag
0,2,0,1,0,2,1,1,0,1,5,...,0,0,0,1,19,0,0,0,7,1
1,0,0,0,0,2,0,0,1,0,2,...,1,1,0,1,6,2,0,2,4,1
2,0,0,0,0,0,1,0,1,1,2,...,0,4,2,0,20,0,0,0,7,1
3,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,1,2,31,1
4,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,13,0,0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,0,0,0,0,0,0,2,1,2,0,...,3,0,2,0,15,0,0,2,13,0
520,0,0,0,0,0,0,2,1,1,2,...,12,0,2,0,35,0,0,5,36,1
521,0,0,1,1,2,0,1,1,0,2,...,13,1,0,0,26,0,0,2,28,1
522,0,1,0,0,0,0,0,1,3,0,...,2,1,0,1,31,4,0,3,15,2


In [5]:
locations.shape


(524, 2)

## new data

### RNA

In [6]:
RNA_data_2 = sc.read_h5ad('xenium data/Yao_150kcells_subsample_with_annotations.h5ad')






In [12]:
RNA_data_2.obs

Unnamed: 0.1,sample_name,Unnamed: 0,cl,exp_component_name,donor_label,sex_label,region_label,gene.counts,library_label,platform_label,...,ss_cluster_id,ss_cluster_color,ss_cluster_label,tenx_cluster_id,tenx_cluster_color,tenx_cluster_label,cell_type_accession_id,cell_type_accession_color,cell_type_accession_label,sss
0,10X_cells.AAACCTGAGCCTTGAT-L8TX_180221_01_F09,279039,340,AAACCTGAGCCTTGAT-L8TX_180221_01_F09,371230,M,ACA,3503,SM-DPC4J-01,10X,...,0,,absent,277,#2E8CCB,327_L6 CT CTX,290,#3E98A5,CCN19103010000290,205692
1,10X_cells.AAACCTGCAAGCTGTT-L8TX_180221_01_F09,279041,182,AAACCTGCAAGCTGTT-L8TX_180221_01_F09,371230,M,ACA,4043,SM-DPC4J-01,10X,...,0,,absent,104,#B1EC30,186_L2 IT RSP-ACA,134,#B1EC30,CCN19103010000134,205694
2,10X_cells.AAACCTGCACGTCTCT-L8TX_180221_01_F09,279043,6,AAACCTGCACGTCTCT-L8TX_180221_01_F09,371230,M,ACA,4408,SM-DPC4J-01,10X,...,0,,absent,9,#935F50,3_Lamp5 Lhx6,6,#9E7153,CCN19103010000006,205696
3,10X_cells.AAACCTGGTTGTCGCG-L8TX_180221_01_F09,279045,180,AAACCTGGTTGTCGCG-L8TX_180221_01_F09,371230,M,ACA,5340,SM-DPC4J-01,10X,...,0,,absent,139,#02F970,183_L2/3 IT CTX_1,169,#1AD475,CCN19103010000169,205698
4,10X_cells.AAACCTGGTTTCGCTC-L8TX_180221_01_F09,279046,340,AAACCTGGTTTCGCTC-L8TX_180221_01_F09,371230,M,ACA,4916,SM-DPC4J-01,10X,...,0,,absent,277,#2E8CCB,327_L6 CT CTX,290,#3E98A5,CCN19103010000290,205699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149950,10X_cells.TTTATGCCAAGGTTCT-L8TX_200611_02_A05,204625,173,TTTATGCCAAGGTTCT-L8TX_200611_02_A05,395345,F,TEa-PERI-ECT,4377,SM-G9JI2-5,10X,...,0,,absent,136,#00FF34,178_L2/3 IT CTX,164,#13A23E,CCN19103010000164,131278
149951,10X_cells.TTTGCGCAGCTGAAAT-L8TX_200611_02_A05,204640,192,TTTGCGCAGCTGAAAT-L8TX_200611_02_A05,395345,F,TEa-PERI-ECT,4858,SM-G9JI2-5,10X,...,0,,absent,148,#0DA3A3,193_L4/5 IT CTX,186,#378695,CCN19103010000186,131293
149952,10X_cells.TTTGGTTAGGGTTTCT-L8TX_200611_02_A05,204648,338,TTTGGTTAGGGTTTCT-L8TX_200611_02_A05,395345,F,TEa-PERI-ECT,5994,SM-G9JI2-5,10X,...,0,,absent,277,#2E8CCB,327_L6 CT CTX,288,#37AFAC,CCN19103010000288,131301
149953,10X_cells.TTTGGTTCACCGATAT-L8TX_200611_02_A05,204650,190,TTTGGTTCACCGATAT-L8TX_200611_02_A05,395345,F,TEa-PERI-ECT,4679,SM-G9JI2-5,10X,...,0,,absent,148,#0DA3A3,193_L4/5 IT CTX,184,#0DA3A3,CCN19103010000184,131303


In [8]:
RNA_data_2.obs_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '149945', '149946', '149947', '149948', '149949', '149950', '149951',
       '149952', '149953', '149954'],
      dtype='object', length=149955)

In [9]:
RNA_data_2.var_names

Index(['Xkr4', 'Gm1992', 'Gm37381', 'Rp1', 'Sox17', 'Gm37323', 'Mrpl15',
       'Lypla1', 'Gm37988', 'Tcea1',
       ...
       'AC125149.1', 'AC125149.2', 'AC125149.4', 'AC234645.1', 'AC168977.2',
       'AC168977.1', 'AC149090.1', 'CAAA01118383.1', 'Vmn2r122',
       'CAAA01147332.1'],
      dtype='object', name='index', length=31053)

In [10]:
RNA_data_2.var.index

Index(['Xkr4', 'Gm1992', 'Gm37381', 'Rp1', 'Sox17', 'Gm37323', 'Mrpl15',
       'Lypla1', 'Gm37988', 'Tcea1',
       ...
       'AC125149.1', 'AC125149.2', 'AC125149.4', 'AC234645.1', 'AC168977.2',
       'AC168977.1', 'AC149090.1', 'CAAA01118383.1', 'Vmn2r122',
       'CAAA01147332.1'],
      dtype='object', name='index', length=31053)

In [11]:
RNA_data_2.X

array([[11.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [30.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [12.,  1.,  0., ...,  0.,  0.,  0.],
       [ 7.,  0.,  0., ...,  0.,  0.,  0.],
       [16.,  1.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [None]:
# RNA_data_2_df = pd.DataFrame(RNA_data_2.X, columns = list(RNA_data_2.var.index))

# RNA_data_2_df.to_csv('new data/scRNA_count.csv', compression='gzip')
# RNA_data_2_df

### spatial data

In [3]:
import anndata
print(anndata.__version__)

0.8.0


In [4]:
Spatial_data_2 = sc.read_h5ad('xenium data/adata_msbrain_3rep_withclusters_only_nuclei.h5ad')
                              
                              

In [5]:
Spatial_data_2

AnnData object with n_obs × n_vars = 82941 × 284
    obs: 'ind', 'cell_id', 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'graph_clusters', 'kmeans2_clusters', 'kmeans3_clusters', 'kmeans4_clusters', 'kmeans5_clusters', 'kmeans6_clusters', 'kmeans7_clusters', 'kmeans8_clusters', 'kmeans9_clusters', 'kmeans10_clusters', 'replicate', 'n_counts', 'leiden_2_2', 'leiden_1_8', 'leiden_1_4', 'leiden_1_0', 'leiden_0_8', 'leiden_0_6', 'cell_code', 'expanded_class', 'expanded_initial_annotation', 'Class', 'initial_annotation', 'celltype_annotation', 'X', 'Y'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'expanded_class_colors', 'expanded_initial_annotation_colors', 'hvg', 'initial_annotation_colors', 'leiden', 'leiden_0_6_colors', 'log1p', 'neighbors', 'pca', 'umap', 'wilcoxon'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities'

In [6]:
Spatial_data_2.obs

Unnamed: 0,ind,cell_id,x_centroid,y_centroid,transcript_counts,control_probe_counts,control_codeword_counts,total_counts,cell_area,nucleus_area,...,leiden_0_8,leiden_0_6,cell_code,expanded_class,expanded_initial_annotation,Class,initial_annotation,celltype_annotation,X,Y
0,1,1,821.466797,605.680298,94,0,0,69.0,148.789844,32.647969,...,6,6,1_1,Oligo,0_MFOL_NFOL,Oligo,6_Oligo_2,Oligo_2,3865.724170,2850.258800
1,2,2,823.761719,615.448608,60,0,0,38.0,118.715781,32.738281,...,0,0,2_1,Astro,10_Astro,Astro,0_Astro,Astro,3876.523797,2896.227297
2,3,3,828.706482,625.965942,38,0,0,40.0,35.176719,23.661875,...,6,6,3_1,Oligo,0_MFOL_NFOL,Oligo,6_Oligo_2,Oligo_2,3899.793259,2945.720609
3,4,4,817.234131,627.839111,162,0,0,130.0,236.663906,71.211406,...,13,13,4_1,Oligo,0_MFOL_NFOL,Microglia,13_Microglia,Microglia,3845.805752,2954.535517
4,5,5,829.293518,632.736938,67,0,0,62.0,71.798438,48.317188,...,18,17,5_1,Ependymal,21_Ependymal,Ependymal,17_Ependymal,Ependymal,3902.555781,2977.584104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85067,28642,28642,2563.504883,3637.863770,146,0,0,76.0,184.960000,22.442656,...,1,2,28642_3,Oligo,20_MOL_MFOL,Vascular,2_Endo_pericytes,Endo_pericytes,12063.546358,17119.350356
85068,28643,28643,2549.242188,3632.845459,265,0,0,114.0,207.267188,23.300625,...,9,8,28643_3,Excitatory,6_L6 CT CTX,Excitatory,8_L6 CT CTX,L6 CT CTX,11996.427825,17095.734789
85069,28644,28644,2536.471191,3638.641113,412,0,0,190.0,499.202344,48.136562,...,9,8,28644_3,Excitatory,6_L6 CT CTX,Excitatory,8_L6 CT CTX,L6 CT CTX,11936.329050,17123.008442
85070,28645,28645,2501.973145,3640.272949,167,0,0,124.0,146.441719,32.783437,...,9,8,28645_3,Excitatory,6_L6 CT CTX,Excitatory,8_L6 CT CTX,L6 CT CTX,11773.985381,17130.687666


In [7]:
locations_df = Spatial_data_2.obs[['x_centroid', 'y_centroid']]
print(locations_df)
locations_df.to_csv('new data/locations.csv')

        x_centroid   y_centroid
0       821.466797   605.680298
1       823.761719   615.448608
2       828.706482   625.965942
3       817.234131   627.839111
4       829.293518   632.736938
...            ...          ...
85067  2563.504883  3637.863770
85068  2549.242188  3632.845459
85069  2536.471191  3638.641113
85070  2501.973145  3640.272949
85071  2514.147217  3640.606689

[82941 rows x 2 columns]


FileNotFoundError: [Errno 2] No such file or directory: 'new data/locations.csv'

In [8]:
Spatial_data_2.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010300C02Rik,False,4.768083,5.878270,-1.078765
Acsbg1,True,4.298373,6.354266,0.965177
Acta2,True,3.051789,6.734463,2.216526
Acvrl1,True,3.534977,6.142235,0.924686
Adamts2,True,2.569652,5.434074,-0.321624
...,...,...,...,...
Vwc2l,True,2.556082,5.472213,-0.205541
Wfs1,False,3.575287,5.581934,-0.539640
Zfp366,True,2.443994,5.853837,0.956001
Zfp536,True,4.785142,6.399535,0.330082


In [26]:
Spatial_data_3 = Spatial_data_2
Spatial_data_3 = Spatial_data_3.var.drop(['means', 'dispersions_norm', 'dispersions'], axis=1)
# Spatial_data_3 = pd.DataFrame(Spatial_data_3.X.todense(), columns = list(Spatial_data_2.var.index))
Spatial_data_3[Spatial_data_3['highly_variable']==True]

Unnamed: 0_level_0,highly_variable
index,Unnamed: 1_level_1
Acsbg1,True
Acta2,True
Acvrl1,True
Adamts2,True
Adamtsl1,True
...,...
Vat1l,True
Vip,True
Vwc2l,True
Zfp366,True


In [None]:
Spatial_data_3 = Spatial_data_2
Spatial_data_3.var.drop(['highly_variable', 'dispersions_norm', 'dispersions'], axis=1)
Spatial_data_3 = pd.DataFrame(Spatial_data_3.X.todense(), columns = list(Spatial_data_2.var.index))
Spatial_data_3

In [None]:
Spatial_data_2.X.todense().shape

In [None]:
len(list(Spatial_data_2.var.index))

In [None]:
Spatial_data_2_df = pd.DataFrame(Spatial_data_2.X.todense(), columns = list(Spatial_data_2.var.index))
# Spatial_data_2_df.to_csv('new data/spatial_data.csv')
Spatial_data_2_df

In [28]:
Spatial_data_2_df['2010300C02Rik'][Spatial_data_2_df['2010300C02Rik'] != 0]

4        5.779449
5        4.394449
6        5.288491
9        5.226742
28       5.283602
           ...   
82927    4.730565
82935    4.443047
82937    4.485477
82938    3.982138
82939    5.492796
Name: 2010300C02Rik, Length: 32289, dtype: float32