In [None]:
import os
import pandas
import numpy
from scipy import sparse

from aavomics import database

import anndata

In [None]:
# Name of the reference's sub-directory under <DATA_PATH>/reference_databases.
REFERENCE_NAME = "20200331_Allen_Cortex_Hippocampus_10X_v3"

# Number of matrix.csv rows buffered as dense Python lists before they are
# converted into one sparse batch (bounds peak memory while loading).
CHUNK_SIZE = 10000

In [None]:
# Directory that holds this reference's matrix.csv and metadata.csv inputs;
# the converted .h5ad file is written back into the same directory.
reference_path = os.path.join(database.DATA_PATH, "reference_databases", REFERENCE_NAME)

In [None]:
# Per-cell metadata table: the first CSV row supplies the column names and the
# first CSV column becomes the index (later looked up by the matrix's cell IDs).
metadata_csv_path = os.path.join(reference_path, "metadata.csv")
metadata_df = pandas.read_csv(metadata_csv_path, index_col=0, header=0)

In [None]:
def _read_count_matrix(csv_file, chunk_size):
    """Parse a dense count CSV into a sparse ``uint16`` CSR matrix.

    The first line of ``csv_file`` is a header: its first field is ignored and
    the remaining fields are the column (gene) names. Every following
    non-blank line is one matrix row: the first field is the row (cell) ID and
    the remaining fields are integer counts.

    Rows are buffered as dense Python lists only ``chunk_size`` at a time; each
    full buffer is converted to a sparse batch, and all batches are stacked
    once at the end (stacking onto a growing matrix after every chunk would
    recopy everything read so far each time).

    NOTE(review): values are stored as ``numpy.uint16`` exactly as before;
    counts above 65535 do not fit in that dtype -- confirm the data never
    exceeds it.

    Args:
        csv_file: An open text file (or any iterable of lines supporting
            ``readline``) positioned at the header line.
        chunk_size: Number of rows to buffer before building a sparse batch.

    Returns:
        A ``(gene_names, row_ids, matrix)`` tuple, where ``matrix`` is a CSR
        matrix of shape ``(len(row_ids), len(gene_names))``.

    Raises:
        ValueError: If a row has a different number of values than the header
            has gene names, or if a value cannot be parsed by ``int``.
    """
    header_row = csv_file.readline()
    gene_names = header_row.strip().split(",")[1:]
    n_genes = len(gene_names)

    row_ids = []
    batches = []
    pending_rows = []

    # The header was line 1, so data lines start at 2 (used for error messages).
    for line_number, line in enumerate(csv_file, start=2):

        # A blank line (e.g. a trailing newline at end of file) is not a cell.
        if not line.strip():
            continue

        fields = line.split(",")
        counts = [int(value) for value in fields[1:]]

        if len(counts) != n_genes:
            raise ValueError(
                "Line %i has %i values but the header lists %i genes"
                % (line_number, len(counts), n_genes)
            )

        row_ids.append(fields[0])
        pending_rows.append(counts)

        if len(pending_rows) == chunk_size:
            batches.append(sparse.csr_matrix(pending_rows, dtype=numpy.uint16))
            pending_rows = []
            # Progress indicator: total rows read so far.
            print(len(row_ids))

    # Flush the final partial chunk. It is empty when the row count is an exact
    # multiple of chunk_size; building a matrix from [] would give shape (1, 0)
    # and make the stack fail on mismatched column counts.
    if pending_rows:
        batches.append(sparse.csr_matrix(pending_rows, dtype=numpy.uint16))

    if batches:
        matrix = sparse.vstack(batches, format="csr", dtype=numpy.uint16)
    else:
        # Header-only file: no rows, but keep the correct number of columns.
        matrix = sparse.csr_matrix((0, n_genes), dtype=numpy.uint16)

    return gene_names, row_ids, matrix


with open(os.path.join(reference_path, "matrix.csv")) as reference_csv_file:
    gene_list, cell_ids, sparse_matrix = _read_count_matrix(reference_csv_file, CHUNK_SIZE)

num_genes = len(gene_list)

In [None]:
# Per-gene annotation table: no columns, just the gene names as the index.
# The builtin `object` is used because the `numpy.object` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24 (it is the same dtype).
genes_df = pandas.DataFrame(index=gene_list, dtype=object)

# Put the metadata rows in the same order as the matrix rows; with current
# pandas, a cell ID that is absent from the metadata raises a KeyError here.
metadata_df = metadata_df.loc[cell_ids]

# Bundle counts (X), per-cell metadata (obs) and per-gene index (var), then
# save the result next to the source CSV files.
adata = anndata.AnnData(X=sparse_matrix, var=genes_df, obs=metadata_df)
adata.write(os.path.join(reference_path, "barcode_transcript_counts.h5ad"))