In [1]:
import pandas as pd
import scanpy as sc
import numpy as np


In [2]:
"""
@author: wen zhang
This function integrates two single-cell datasets, spatial and scRNA-seq, 
and predictes the expression of the spatially unmeasured genes from the scRNA-seq data.

Parameters
-------
RNA_file : str
    scRNA-seq data count file with Tab-delimited (cells X genes).
Spatial_file : str
    spatial count data file with Tab-delimited, please note that the file has no index.
location_file : str
    spatial spot coordinate file name with Tab-delimited, please note that the file has no index.
device : str
    Option,  ['CPU','GPU'], defaults to 'CPU'
train_gene : list
    genes for integrations, you can support more than one train list.
predict_gene : list
    genes for prediction, you can support more than one test list.
outdir : str
    result file stored direction    
"""

DataDir = 'DataUpload/Dataset4/'
outdir = 'FigureData/Figure2/Dataset4/'
RNA_file = DataDir + 'scRNA_count.txt'
Spatial_file = DataDir + 'Insitu_count.txt'
location_file = DataDir + 'Locations.txt'

RNA_data = pd.read_table(RNA_file, header=0, index_col = 0)
Spatial_data = pd.read_table(Spatial_file, sep = '\t',header = 0)
RNA_data_adata = sc.read(RNA_file, sep = '\t', first_column_names = True).T
Spatial_data_adata = sc.read(Spatial_file, sep = '\t')
locations = np.loadtxt(location_file, skiprows=1)

# train_gene = np.load(DataDir + 'train_list.npy', allow_pickle = True).tolist()
# predict_gene = np.load(DataDir + 'test_list.npy', allow_pickle = True).tolist()

# # device = 'GPU'

# if not os.path.exists(outdir):
#     os.mkdir(outdir)




In [3]:
RNA_data

Unnamed: 0,F1S4_160108_001_A01,F1S4_160108_001_B01,F1S4_160108_001_C01,F1S4_160108_001_D01,F1S4_160108_001_E01,F1S4_160108_001_F01,F1S4_160108_001_G01,F1S4_160108_001_H01,F1S4_160108_002_A01,F1S4_160108_002_B01,...,FYS4_171004_103_F01,FYS4_171004_103_G01,FYS4_171004_103_H01,FYS4_171004_104_A01,FYS4_171004_104_B01,FYS4_171004_104_C01,FYS4_171004_104_D01,FYS4_171004_104_F01,FYS4_171004_104_G01,FYS4_171004_104_H01
X0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X0610007P14Rik,79,121,89,115,390,61,72,161,118,121,...,82,263,45,88,81,187,99,123,112,37
X0610009B22Rik,140,175,68,0,177,110,149,60,61,114,...,66,136,82,74,104,51,160,245,179,91
X0610009E02Rik,0,1,0,0,0,0,0,0,0,26,...,3,0,0,0,0,0,0,0,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zzef1,36,1,104,0,0,0,17,6,56,0,...,0,0,0,0,11,0,16,0,54,0
Zzz3,0,29,21,30,9,88,8,14,0,113,...,30,0,0,26,0,70,0,93,0,70
a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
l7Rn6,190,178,153,110,335,178,131,128,41,258,...,121,287,113,94,147,133,271,75,282,0


In [21]:
Spatial_data

Unnamed: 0,Usp40,Pank2,Nrip2,Tmem245,Col6a1,Slc7a4,Adam19,Abcf1,Pip4k2c,Gm1966,...,Lynx1,Trp53i11,Ctgf,Mog,Slc17a7,Cldn11,Gsn,Cnp,Cplx1,Mag
0,2,0,1,0,2,1,1,0,1,5,...,0,0,0,1,19,0,0,0,7,1
1,0,0,0,0,2,0,0,1,0,2,...,1,1,0,1,6,2,0,2,4,1
2,0,0,0,0,0,1,0,1,1,2,...,0,4,2,0,20,0,0,0,7,1
3,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,1,2,31,1
4,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,13,0,0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,0,0,0,0,0,0,2,1,2,0,...,3,0,2,0,15,0,0,2,13,0
520,0,0,0,0,0,0,2,1,1,2,...,12,0,2,0,35,0,0,5,36,1
521,0,0,1,1,2,0,1,1,0,2,...,13,1,0,0,26,0,0,2,28,1
522,0,1,0,0,0,0,0,1,3,0,...,2,1,0,1,31,4,0,3,15,2


In [48]:
locations.shape


(524, 2)

## new data

### RNA

In [None]:
RNA_data_2 = sc.read_h5ad('new data/Yao_150kcells_subsample_with_annotations.h5ad')




In [None]:
RNA_data_2.obs.columns

In [None]:
RNA_data_2.var.index

In [None]:
RNA_data_2.X

In [None]:
RNA_data_2_df = pd.DataFrame(RNA_data_2.X, columns = list(RNA_data_2.var.index))

RNA_data_2_df.to_csv('new data/scRNA_count.csv', compression='gzip')
RNA_data_2_df

### spatial data

In [19]:
import anndata
print(anndata.__version__)

0.8.0


In [20]:
Spatial_data_2 = sc.read_h5ad('new data/adata_msbrain_3rep_withclusters_only_nuclei_deep_annotation_with_domains_CORT_HIPP_raw_input_bench.h5ad')
                              
                              

In [21]:
Spatial_data_2

AnnData object with n_obs × n_vars = 49773 × 284
    obs: 'ind', 'cell_id', 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'graph_clusters', 'kmeans2_clusters', 'kmeans3_clusters', 'kmeans4_clusters', 'kmeans5_clusters', 'kmeans6_clusters', 'kmeans7_clusters', 'kmeans8_clusters', 'kmeans9_clusters', 'kmeans10_clusters', 'replicate', 'n_counts', 'leiden_2_2', 'leiden_1_8', 'leiden_1_4', 'leiden_1_0', 'leiden_0_8', 'leiden_0_6', 'cell_code', 'expanded_class', 'expanded_initial_annotation', 'Class', 'initial_annotation', 'celltype_annotation', 'X', 'Y', 'leiden_0_4', 'leiden_0_2', 'Class_old', 'class_number', 'region_annotation', 'region_level1'

In [22]:
Spatial_data_2.obs

Unnamed: 0,ind,cell_id,x_centroid,y_centroid,transcript_counts,control_probe_counts,control_codeword_counts,total_counts,cell_area,nucleus_area,...,initial_annotation,celltype_annotation,X,Y,leiden_0_4,leiden_0_2,Class_old,class_number,region_annotation,region_level1
7,8,8,18.432051,641.433838,146,0,0,63.0,347.928906,42.808125,...,Oligo,Oligo,86.739019,3018.510669,0,0,Oligo,0_Oligo,CP,CH
8,9,9,19.799652,653.951599,94,0,0,70.0,211.918281,31.293281,...,L6b CTX,L6b CTX,93.174787,3077.417751,3,7,Excitatory,7_Excitatory,CP,CH
10,11,11,6.713511,661.158325,94,0,0,30.0,438.963906,19.236563,...,Astro1,Astro1,31.592979,3111.331739,1,1,Astro,1_Astro,CP,CH
11,12,12,28.277199,671.846619,220,0,0,37.0,580.664219,18.378594,...,Vip INH,Vip INH,133.069104,3161.629566,4,5,Inhibitory,5_Inhibitory,CP,CH
12,13,13,9.217441,708.382568,105,0,0,43.0,645.553750,22.668437,...,Car4+ EC,Car4+ EC,43.376169,3333.563361,2,2,Vascular,2_Vascular,CP,CH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85067,28642,28642,2563.504883,3637.863770,146,0,0,76.0,184.960000,22.442656,...,Car4+ EC,Car4+ EC,12063.546358,17119.350356,2,2,Vascular,2_Vascular,L6b,CH
85068,28643,28643,2549.242188,3632.845459,265,0,0,114.0,207.267188,23.300625,...,L6 CT Syt6,L6 CT Syt6,11996.427825,17095.734789,3,7,Excitatory,7_Excitatory,L6b,CH
85069,28644,28644,2536.471191,3638.641113,412,0,0,190.0,499.202344,48.136562,...,L6b CTX,L6b CTX,11936.329050,17123.008442,3,7,Excitatory,7_Excitatory,L6b,CH
85070,28645,28645,2501.973145,3640.272949,167,0,0,124.0,146.441719,32.783437,...,L6 CT Syt6,L6 CT Syt6,11773.985381,17130.687666,3,7,Excitatory,7_Excitatory,L6a,CH


In [23]:
Spatial_data_2.obs['replicate']

7        1
8        1
10       1
11       1
12       1
        ..
85067    3
85068    3
85069    3
85070    3
85071    3
Name: replicate, Length: 49773, dtype: category
Categories (3, object): ['1', '2', '3']

In [31]:
Spatial_data_2.X.shape

(15359, 284)

In [30]:
Spatial_data_2 = Spatial_data_2[Spatial_data_2.obs['replicate']=='1']
Spatial_data_2

View of AnnData object with n_obs × n_vars = 15359 × 284
    obs: 'ind', 'cell_id', 'x_centroid', 'y_centroid', 'transcript_counts', 'control_probe_counts', 'control_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'graph_clusters', 'kmeans2_clusters', 'kmeans3_clusters', 'kmeans4_clusters', 'kmeans5_clusters', 'kmeans6_clusters', 'kmeans7_clusters', 'kmeans8_clusters', 'kmeans9_clusters', 'kmeans10_clusters', 'replicate', 'n_counts', 'leiden_2_2', 'leiden_1_8', 'leiden_1_4', 'leiden_1_0', 'leiden_0_8', 'leiden_0_6', 'cell_code', 'expanded_class', 'expanded_initial_annotation', 'Class', 'initial_annotation', 'celltype_annotation', 'X', 'Y', 'leiden_0_4', 'leiden_0_2', 'Class_old', 'class_number', 'region_annotation', 'region_level1'

In [32]:
Spatial_data_2.X.shape

(15359, 284)

In [40]:
Spatial_data_2.obs['cell_code']

7            8_1
8            9_1
10          11_1
11          12_1
12          13_1
          ...   
26357    26368_1
26358    26369_1
26359    26370_1
26360    26371_1
26361    26372_1
Name: cell_code, Length: 15359, dtype: object

In [34]:
locations_df = Spatial_data_2.obs[['cell_code','x_centroid', 'y_centroid']]
print(locations_df)
locations_df.to_csv('new data/locations.csv')

      cell_code   x_centroid   y_centroid
7           8_1    18.432051   641.433838
8           9_1    19.799652   653.951599
10         11_1     6.713511   661.158325
11         12_1    28.277199   671.846619
12         13_1     9.217441   708.382568
...         ...          ...          ...
26357   26368_1  2504.661377  3565.848145
26358   26369_1  2523.025391  3568.661865
26359   26370_1  2537.839355  3621.497070
26360   26371_1  2526.644531  3624.125000
26361   26372_1  2540.800537  3422.432861

[15359 rows x 3 columns]


In [35]:
Spatial_data_2.var

2010300C02Rik
Acsbg1
Acta2
Acvrl1
Adamts2
...
Vwc2l
Wfs1
Zfp366
Zfp536
Zfpm2


In [12]:
# Spatial_data_2.var.drop(['highly_variable', 'dispersions_norm', 'dispersions'], axis=1)

In [36]:
# Spatial_data_2.X.todense().shape
Spatial_data_2.X.shape

(15359, 284)

In [37]:
len(list(Spatial_data_2.var.index))

284

In [41]:
# Spatial_data_2_df = pd.DataFrame(Spatial_data_2.X.todense(), columns = list(Spatial_data_2.var.index))

Spatial_data_2_df = pd.DataFrame(Spatial_data_2.X, columns = list(Spatial_data_2.var.index), index=list(Spatial_data_2.obs['cell_code']))

Spatial_data_2_df.to_csv('new data/spatial_data.csv')
Spatial_data_2_df

Unnamed: 0,2010300C02Rik,Acsbg1,Acta2,Acvrl1,Adamts2,Adamtsl1,Adgrl4,Aldh1a2,Aldh1l1,Angpt1,...,Trpc4,Tubb2a,Unc13c,Vat1l,Vip,Vwc2l,Wfs1,Zfp366,Zfp536,Zfpm2
8_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
9_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
11_1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
13_1,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26368_1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26369_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26370_1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
26371_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
