In [1]:
from vpolo.alevin import parser
import scanpy as sc
import pandas as pd
import os 
from scipy.io import mmwrite
from scipy.sparse import csr_matrix
import tempfile

In [30]:
def alevin_to_scanpy(Dataset_path):

    if os.path.isdir(Dataset_path):
        alevin_df = parser.read_quants_bin(Dataset_path).T
        #alevin_df = alevin_df.T
        print(alevin_df.shape)
        adata = sc.AnnData(alevin_df)
        print(adata)
        with tempfile.TemporaryDirectory() as tempdir:
            # matrix.mtx
            mmwrite(os.path.join(tempdir, "matrix.mtx"), csr_matrix(adata.X))
            print(adata.X)
            
            # barcode file
            barcodes_df = pd.DataFrame(adata.var_names) #not adata.obs_names as we are using transpose matrix
            barcodes_df.to_csv(os.path.join(tempdir, "barcodes.tsv"), sep="\t", index=False, header=False)
            print(adata.var_names)
            
            #adata.obs_names = adata.obs_names.str.split('.').str[0]
            # genes file
            adata.obs_names = adata.obs_names.str.split('.').str[0]
            genes_df = pd.DataFrame(adata.obs_names)
            genes_df.to_csv(os.path.join(tempdir, "genes.tsv"), sep="\t", index=False, header=False)
            print(adata.obs_names)
            
            # scanpy input
            adata = sc.read_10x_mtx(
                tempdir,  # the directory with all three files
                var_names="gene_ids",  # use gene symbols for the variable names (variables-axis index)
                cache=False,  # disabling the automatic cache file so we can create one mannually in the training folder.
            )
            adata.var_names_make_unique()
        return adata

In [31]:
Dataset_path = "/home/jovyan/ifbdata/spatial_cell_id/Kush/alignment/demo_matrix/align/SRR9036396"

output = alevin_to_scanpy(Dataset_path)

/home/jovyan/ifbdata/spatial_cell_id/Kush/alignment/demo_matrix/align/SRR9036396/alevin
Using rust mode with 3281 rows and 78277 columns
(78277, 3281)
AnnData object with n_obs × n_vars = 78277 × 3281
[[  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           1.5          0.         ...   0.33333334   0.
    0.        ]
 ...
 [162.         189.          55.         ...   8.           4.
    2.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]]
Index(['ENSG00000290825.2', 'ENSG00000223972.6', 'ENSG00000310526.1',
       'ENSG00000227232.6', 'ENSG00000278267.1', 'ENSG00000243485.6',
       'ENSG00000284332.1', 'ENSG00000237613.3', 'ENSG00000308361.1',
       'ENSG00000290826.2',
       ...
       'ENSG00000198886.2', 'ENSG00000210176.1', 'ENSG00000210184.1',


ValueError: Length of passed value for var_names is 3281, but this AnnData has shape: (3281, 78277)

In [6]:
alevin_df = parser.read_quants_bin(Dataset_path).T
print(alevin_df.shape)
adata = sc.AnnData(alevin_df)

/home/jovyan/ifbdata/spatial_cell_id/Kush/alignment/demo_matrix/align/SRR9036396/alevin
Using rust mode with 3281 rows and 78277 columns
(78277, 3281)


In [22]:
adata.obs_names

Index(['ENSG00000290825', 'ENSG00000223972', 'ENSG00000310526',
       'ENSG00000227232', 'ENSG00000278267', 'ENSG00000243485',
       'ENSG00000284332', 'ENSG00000237613', 'ENSG00000308361',
       'ENSG00000290826',
       ...
       'ENSG00000198886', 'ENSG00000210176', 'ENSG00000210184',
       'ENSG00000210191', 'ENSG00000198786', 'ENSG00000198695',
       'ENSG00000210194', 'ENSG00000198727', 'ENSG00000210195',
       'ENSG00000210196'],
      dtype='object', length=78277)

In [21]:
adata.obs_names = adata.obs_names.str.split('.').str[0]