In [None]:
from vpolo.alevin import parser
import scanpy as sc
import pandas as pd
import os 
from scipy.io import mmwrite
from scipy.sparse import csr_matrix
import shutil
import tempfile

In [None]:
#######
def alevin_to_scanpy(Dataset_path, protein_coding_file, universal_genes_file, metadata_file):
    """Convert each alevin output directory under ``Dataset_path`` into an annotated AnnData.

    For every non-hidden sub-directory the alevin quantification is read, restricted to
    the protein-coding gene list (genes absent from the sample are added as all-zero
    rows), written to a temporary 10x-style folder (``matrix.mtx``, ``barcodes.tsv``,
    ``genes.tsv``) and read back with ``sc.read_10x_mtx``. Sample-level metadata is
    then attached to ``adata.obs``.

    Parameters
    ----------
    Dataset_path : str
        Directory whose sub-directories are alevin outputs. Each sub-directory name is
        matched against the ``Accession`` column of the metadata sheet.
    protein_coding_file : str
        Text file with one gene identifier per line. Duplicates and blank lines are
        dropped; the line order defines the row order written to ``matrix.mtx``.
    universal_genes_file : str
        File copied verbatim as ``genes.tsv``.
        NOTE(review): presumably lists the same genes, in the same order, as
        ``protein_coding_file`` -- confirm, since rows are paired by position only.
    metadata_file : str
        Excel workbook with the columns ``Accession``, ``AGE``, ``SEX``, ``SKIN AREA``
        and ``ETHNICITY``.

    Returns
    -------
    dict
        Maps sub-directory name to its AnnData, whose ``obs`` carries the columns
        ``Dataset``, ``Age``, ``Sex``, ``Skin area`` and ``Ethnicity``.

    Raises
    ------
    IndexError
        If a sub-directory name has no matching ``Accession`` row in the metadata.
    """
    # De-duplicate while KEEPING the file's line order. Iterating a set of strings gives
    # a different order in every Python process (hash randomisation), which would
    # shuffle the matrix rows relative to the fixed-order genes.tsv copied below.
    with open(protein_coding_file, "r") as file:
        protein_coding_genes = list(
            dict.fromkeys(gene for gene in (line.strip() for line in file) if gene)
        )

    metadata_df = pd.read_excel(metadata_file)
    # obs column name -> metadata sheet column it is filled from
    obs_to_metadata_column = {
        "Age": "AGE",
        "Sex": "SEX",
        "Skin area": "SKIN AREA",
        "Ethnicity": "ETHNICITY",
    }

    adata_dict = {}  # sub-directory name -> AnnData
    # Sorted so the dict order (and any later concatenation order) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for subdir in sorted(os.listdir(Dataset_path)):
        subdir_path = os.path.join(Dataset_path, subdir)
        # Skip hidden entries (like .ipynb_checkpoints) and plain files.
        if subdir.startswith('.') or not os.path.isdir(subdir_path):
            continue

        # Resolve the metadata row once, before the expensive parse, so an unknown
        # accession fails fast with a readable message.
        sample_metadata = metadata_df.loc[metadata_df["Accession"] == subdir]
        if sample_metadata.empty:
            raise IndexError(
                f"no row with Accession == {subdir!r} found in {metadata_file}"
            )

        # Transposed so that genes are rows and cell barcodes are columns.
        genes_by_cells = parser.read_quants_bin(subdir_path).T
        n_cells = genes_by_cells.shape[1]
        n_present = int(genes_by_cells.index.isin(protein_coding_genes).sum())
        print(f"dimensions after filtering: {(n_present, n_cells)}")
        print(f"adding missing genes: {(len(protein_coding_genes) - n_present, n_cells)}")
        # One reindex keeps only protein-coding genes, inserts all-zero rows for genes
        # the sample lacks, and orders the rows like the gene list.
        filtered_alevin_df = genes_by_cells.reindex(protein_coding_genes, fill_value=0)
        print(f"final dimension:{filtered_alevin_df.shape}")

        # Intermediate object is genes x cells, so its var_names are the cell barcodes.
        transposed_adata = sc.AnnData(filtered_alevin_df)

        # The temporary directory is deleted as soon as the "with" block exits.
        with tempfile.TemporaryDirectory() as tempdir:
            # matrix.mtx (genes x cells)
            mmwrite(os.path.join(tempdir, "matrix.mtx"), csr_matrix(transposed_adata.X))

            # barcodes.tsv -- var_names, not obs_names, because the matrix is transposed
            pd.DataFrame(transposed_adata.var_names).to_csv(
                os.path.join(tempdir, "barcodes.tsv"), sep="\t", index=False, header=False
            )

            # genes.tsv
            shutil.copy(universal_genes_file, os.path.join(tempdir, "genes.tsv"))

            adata = sc.read_10x_mtx(
                tempdir,  # the directory with all three files
                var_names="gene_symbols",  # use gene symbols for the variable names
                cache=False,  # no automatic cache file; one is created manually elsewhere
            )

        adata.var_names_make_unique()

        # Sample-level metadata, broadcast to every cell of this sample.
        adata.obs["Dataset"] = subdir
        for obs_column, metadata_column in obs_to_metadata_column.items():
            adata.obs[obs_column] = sample_metadata[metadata_column].values[0]

        adata_dict[subdir] = adata

    return adata_dict

In [None]:
# Input locations: the alevin outputs for this publication plus the shared reference files.
Dataset_path = "/home/jovyan/ifbdata/spatial_cell_id/Kush/alignment/photoaging_extensor_side_alevin"
protein_coding_file = "/home/jovyan/ifbdata/spatial_cell_id/Reference/txp2gene/protein_coding_genes_version.txt"
universal_genes_file = "/home/jovyan/ifbdata/spatial_cell_id/Reference/txp2gene/genes.tsv"
metadata_file = "/home/jovyan/ifbdata/spatial_cell_id/Reference/Clarins_datasets_metadata.xlsx"

# Convert every sample directory, then tag each resulting object with its publication.
photoaging_extensor_side = alevin_to_scanpy(
    Dataset_path,
    protein_coding_file,
    universal_genes_file,
    metadata_file,
)
for sample_adata in photoaging_extensor_side.values():
    # A scalar is broadcast to every row of obs.
    sample_adata.obs['Publication'] = 'photoaging_extensor_side'

In [None]:
# Stack all per-sample objects of this publication into a single object (axis 0).
Concatenated_adata = sc.concat([*photoaging_extensor_side.values()], axis=0)

In [None]:
# Bare expression: the notebook shows this object's repr as the cell output.
Concatenated_adata

In [None]:
# concat takes ONE collection of AnnData objects and every other argument is
# keyword-only, so the running result and the new per-sample objects go into one list.
# NOTE(review): `output_sun_protected_human_skin_inguinoiliac` is not defined in the
# visible cells -- it must be built (e.g. with alevin_to_scanpy) before this cell runs.
# Re-running this cell appends the same samples again; restart from the first concat cell.
Concatenated_adata = sc.concat(
    [Concatenated_adata, *output_sun_protected_human_skin_inguinoiliac.values()],
    axis=0,
)

In [None]:
# Bare expression: shows the repr of the object reassigned by the preceding concat cell.
Concatenated_adata