## Convert scRNA-seq and Xenium spatial data (requires an up-to-date scanpy)

### scRNAseq

In [22]:
import pandas as pd
import scanpy as sc
import numpy as np

In [23]:
# Load the 10Xv2 scRNA-seq reference that was prepared for the Xenium benchmark.
RNA_data_2 = sc.read_h5ad('newData/WMB-10Xv2-subset02_ready_for_xenium_benchmark.h5ad')

# Keep a 5% sample of the cells (in place). random_state=0 is scanpy's default,
# written out here so the reproducibility of the sample is visible.
sc.pp.subsample(RNA_data_2, fraction=0.05, random_state=0)

# Drop genes that are not detected in any of the remaining cells (in place).
sc.pp.filter_genes(RNA_data_2, min_cells=1)

# Gene symbols are used as column labels in the exported count table, so keep
# only the first gene for every repeated symbol.
repeated_symbol = RNA_data_2.var['gene_symbol'].duplicated(keep='first')
RNA_data_2 = RNA_data_2[:, ~repeated_symbol]

In [24]:
# List the per-cell metadata columns carried by the subsampled scRNA-seq object.
RNA_data_2.obs.columns

Index(['cell_barcode', 'library_label_x', 'anatomical_division_label_x',
       'library_label_y', 'anatomical_division_label_y', 'cluster_alias',
       'library_method', 'region_of_interest_acronym', 'donor_label',
       'donor_genotype', 'donor_sex', 'dataset_label', 'matrix_label', 'x',
       'y', 'neurotransmitter', 'division', 'class', 'subclass', 'supertype',
       'cluster', 'neurotransmitter_color', 'division_color', 'class_color',
       'subclass_color', 'supertype_color', 'cluster_color',
       'region_of_interest_order', 'region_of_interest_color'],
      dtype='object')

In [25]:
# Show the gene index of the scRNA-seq object (the recorded output lists
# ENSMUSG identifiers, i.e. the index is not the gene symbol).
RNA_data_2.var.index

Index(['ENSMUSG00000051951', 'ENSMUSG00000089699', 'ENSMUSG00000102331',
       'ENSMUSG00000102343', 'ENSMUSG00000025900', 'ENSMUSG00000025902',
       'ENSMUSG00000033845', 'ENSMUSG00000025903', 'ENSMUSG00000033813',
       'ENSMUSG00000002459',
       ...
       'ENSMUSG00000096808', 'ENSMUSG00000051412', 'ENSMUSG00000061654',
       'ENSMUSG00000079834', 'ENSMUSG00000096506', 'ENSMUSG00000095552',
       'ENSMUSG00000094350', 'ENSMUSG00000096237', 'ENSMUSG00000095742',
       'ENSMUSG00000095041'],
      dtype='object', name='gene_identifier', length=26474)

In [26]:
# Display the AnnData summary (dimensions plus the obs/var fields) after filtering.
RNA_data_2

View of AnnData object with n_obs × n_vars = 13785 × 26474
    obs: 'cell_barcode', 'library_label_x', 'anatomical_division_label_x', 'library_label_y', 'anatomical_division_label_y', 'cluster_alias', 'library_method', 'region_of_interest_acronym', 'donor_label', 'donor_genotype', 'donor_sex', 'dataset_label', 'matrix_label', 'x', 'y', 'neurotransmitter', 'division', 'class', 'subclass', 'supertype', 'cluster', 'neurotransmitter_color', 'division_color', 'class_color', 'subclass_color', 'supertype_color', 'cluster_color', 'region_of_interest_order', 'region_of_interest_color'
    var: 'gene_symbol', 'name', 'comment', 'n_cells'

In [27]:
# Export the scRNA-seq counts as a tab-separated genes x cells table.
# The dense frame is built cells x genes (rows = obs index, columns = gene
# symbols) and transposed right before writing; it is deliberately not bound
# to a name so the large dense matrix is released once the file is written.
cell_labels = list(RNA_data_2.obs.index)
symbol_labels = list(RNA_data_2.var['gene_symbol'])
(
    pd.DataFrame(
        RNA_data_2.X.todense(),
        columns=symbol_labels,
        index=cell_labels,
    )
    .T
    .to_csv('newData/scRNA_count.txt', sep='\t', float_format='%.0f')
)

In [30]:
# Save the subsampled, gene-filtered scRNA-seq object to a new .h5ad file.
RNA_data_2.write_h5ad(filename='newData/WMB-10Xv2-subset02_ready_for_xenium_benchmark_subset.h5ad')

### Xenium

In [31]:
import pandas as pd
import scanpy as sc
import numpy as np
import anndata
# Print the installed anndata version so it is recorded in the notebook output.
print(anndata.__version__)

0.9.2


In [32]:
# Load the Xenium mouse-brain section.
Spatial_data_2 = sc.read_h5ad('newData/ms_brain_multisection1.h5ad')

# Keep a 1% sample of the cells (in place). random_state=0 is scanpy's default,
# written out here so the reproducibility of the sample is visible.
sc.pp.subsample(Spatial_data_2, fraction=0.01, random_state=0)

In [33]:
# Display the per-cell metadata of the subsampled spatial object
# (centroid coordinates, count totals, cell/nucleus areas in the recorded output).
Spatial_data_2.obs

Unnamed: 0,cell_id,x_centroid,y_centroid,transcript_counts,control_probe_counts,control_codeword_counts,total_counts,cell_area,nucleus_area
90832,90833,5823.691992,2712.051379,277,0,0,277,512.658906,48.046250
158997,158998,7517.363477,2256.782166,350,0,0,350,230.387188,69.360000
635,636,1653.728540,5274.607764,76,1,0,77,32.964063,15.895000
61504,61505,7746.951465,5547.483496,486,1,0,487,599.584687,41.182500
84824,84825,9201.830225,2024.420679,474,0,0,475,291.257812,77.442969
...,...,...,...,...,...,...,...,...,...
32936,32937,4468.304736,4129.319507,384,0,0,384,235.218906,77.939688
120787,120788,9241.787695,4487.567944,57,0,0,57,28.674219,11.469687
80981,80982,6382.961694,2208.255591,220,1,0,221,245.424219,40.685781
158489,158490,7464.743994,2402.174084,360,0,0,360,201.577500,63.309063


In [34]:
# Cell centroid coordinates, relabelled to the X / Y headers used in the
# exported locations table.
locations_df = Spatial_data_2.obs[['x_centroid', 'y_centroid']].rename(
    columns={'x_centroid': 'X', 'y_centroid': 'Y'}
)
print(locations_df)

# Tab-separated, no index column: rows follow the cell order of Spatial_data_2.
locations_df.to_csv('newData/Locations.txt', sep='\t', index=False)

                  X            Y
90832   5823.691992  2712.051379
158997  7517.363477  2256.782166
635     1653.728540  5274.607764
61504   7746.951465  5547.483496
84824   9201.830225  2024.420679
...             ...          ...
32936   4468.304736  4129.319507
120787  9241.787695  4487.567944
80981   6382.961694  2208.255591
158489  7464.743994  2402.174084
54619   3815.010010  2824.826208

[1620 rows x 2 columns]


In [35]:
# Count the panel entries per `in_panel` category (the recorded output shows
# 'gene' and 'negative_control').
Spatial_data_2.var['in_panel'].value_counts()

gene                248
negative_control     27
Name: in_panel, dtype: int64

In [37]:
# Keep only the panel entries flagged as real genes; every other `in_panel`
# category (e.g. negative controls) is dropped.
is_panel_gene = Spatial_data_2.var['in_panel'] == 'gene'
Spatial_data_2 = Spatial_data_2[:, is_panel_gene]

In [38]:
# Export the in situ counts as a tab-separated cells x genes table without an
# index column. The dense frame is not bound to a name, so it is released as
# soon as the file has been written.
panel_gene_ids = list(Spatial_data_2.var['gene_id'])
(
    pd.DataFrame(Spatial_data_2.X.todense(), columns=panel_gene_ids)
    .to_csv('newData/Insitu_count.txt', sep='\t', index=False, float_format='%.0f')
)

In [39]:
# Save the subsampled, panel-gene-only spatial object to a new .h5ad file.
Spatial_data_2.write_h5ad(filename='newData/ms_brain_multisection1_subset.h5ad')

### Gene List

In [41]:
import random

# Build train/test gene lists for k-fold cross-validation over the panel genes:
# every gene lands in exactly one test fold, and the matching train list holds
# all remaining genes.

# Fixed seed so that the folds are identical on every run of the notebook.
SPLIT_SEED = 0

# Number of folds.
num_splits = 10

# Genes to partition.
original_list = list(Spatial_data_2.var['gene_id'])

# Shuffle with a private, seeded generator: reproducible, and the global
# `random` state used elsewhere is left untouched.
random.Random(SPLIT_SEED).shuffle(original_list)

# Base fold size plus the number of folds that receive one extra gene.
sublist_size, remainder = divmod(len(original_list), num_splits)

# Contiguous slices of the shuffled list; the first `remainder` folds are one
# element longer than the rest.
split_lists = [
    original_list[i * sublist_size + min(i, remainder):(i + 1) * sublist_size + min(i + 1, remainder)]
    for i in range(num_splits)
]

# Each fold serves once as the held-out test list.
all_test_lists = list(split_lists)

# Matching train lists: everything that is not held out, in shuffled order.
# Membership is checked against a set instead of scanning the test list.
all_train_lists = []
for test_list in all_test_lists:
    held_out = set(test_list)
    all_train_lists.append([item for item in original_list if item not in held_out])

# Print the train_list and test_list for each iteration
for i, (train_list, test_list) in enumerate(zip(all_train_lists, all_test_lists)):
    print(f"Iteration {i + 1} - Train List: {train_list}, Test List: {test_list}")

# Save all training and testing results to the corresponding .npy files.
# Folds differ in length whenever the gene count is not a multiple of
# num_splits; NumPy >= 1.24 refuses to build an array from such ragged nested
# lists unless dtype=object is requested explicitly. The resulting files are
# pickled object arrays and must be read back with allow_pickle=True.
np.save('newData/train_list.npy', np.array(all_train_lists, dtype=object))
np.save('newData/test_list.npy', np.array(all_test_lists, dtype=object))

Iteration 1 - Train List: ['Fezf2', 'Adgrl4', 'Cdh20', 'Cntn6', 'Kcnh5', 'Siglech', 'Pglyrp1', 'Epha4', 'Nell1', 'Kctd12', 'Gad2', 'Prox1', 'Hpcal1', 'Cort', '2010300C02Rik', 'Dner', 'Fos', 'Gad1', 'Nts', 'Garnl3', 'Slit2', 'Rasgrf2', 'Igfbp4', 'Lyz2', 'Myl4', 'Sox11', 'Sipa1l3', 'Sla', 'Vat1l', 'Vip', 'Cd24a', 'Mapk4', 'Foxp2', 'Pdzrn3', 'Kctd8', 'Slc44a5', 'Tmem163', 'Trp73', 'Gm19410', 'Pecam1', 'Aldh1a2', 'Sema5b', 'Tox', 'Nostrin', 'Cplx3', 'Arc', 'Chodl', 'Gfra2', 'Spi1', 'Pln', 'Npy2r', 'Col6a1', 'Meis2', 'Hapln1', 'Rims3', 'Rorb', 'Pkib', 'Cpne8', 'Rasl10a', 'Trpc4', 'Trbc2', 'Slc17a7', 'Eya4', 'Syndig1', 'Fibcd1', 'Ntsr2', 'Parm1', 'Syt6', 'Shisa6', 'Aqp4', 'Rprm', 'Gsg1l', 'Sox17', 'Mecom', 'Gadd45a', 'Gjb2', 'Cdh6', 'Igf2', 'Igsf21', 'Nrep', 'Cwh43', 'Slc6a3', 'Prph', 'Syt2', 'Emcn', 'Plekha2', 'Col1a1', 'Satb2', 'Cbln1', 'Prss35', 'Spp1', 'Cbln4', 'Sorcs3', 'Rfx4', 'Sdk2', 'Ndst4', 'Thsd7a', 'Arhgap12', 'Plch1', 'Pdgfra', 'Gfap', 'Cd53', 'Th', 'Laptm5', 'Car4', 'Myo16', 'Gn

In [43]:
import numpy as np
import pandas as pd

# Re-export the saved fold arrays as CSV files.
# allow_pickle=True is needed to read back arrays stored with object dtype;
# only use it on files produced by this notebook.

# Training folds: .npy -> DataFrame -> CSV (no index column).
df = pd.DataFrame(np.load('newData/train_list.npy', allow_pickle=True))
df.to_csv('newData/train_list.csv', index=False)

# Test folds: same conversion.
data = np.load('newData/test_list.npy', allow_pickle=True)
pd.DataFrame(data).to_csv('newData/test_list.csv', index=False)