In [None]:
import gzip
import os
import shutil
import tarfile
from os.path import expanduser

import pandas

In [None]:
# Point the AAVOMICS_PATH environment variable at the directory that should
# hold the data; edit the path below to store it somewhere else.
os.environ["AAVOMICS_PATH"] = expanduser("~/aavomics_data")

In [None]:
import aavomics

from aavomics import database
from pepars.fileio import fileio

# Download the database files

In [None]:
# Files to download. Outer keys are destination sub-directories relative to
# database.DATA_PATH; each inner dict maps a local file name to its remote URL.
FILES_TO_DOWNLOAD = {
    # CSV files stored under the "database" sub-directory
    "database": {
        "Alignments.csv": "https://data.caltech.edu/tindfiles/serve/6701d6ef-ee8b-4316-8f79-5533a011ac1d/",
        "Animals.csv": "https://data.caltech.edu/tindfiles/serve/55be5772-fbd4-4852-b7f7-4855181c4fd4/",
        "Cell Sets.csv": "https://data.caltech.edu/tindfiles/serve/15ec4be7-6e77-43d7-8d3d-c8a219241ef6/",
        "Dissociation Runs.csv": "https://data.caltech.edu/tindfiles/serve/100feb00-6a76-4139-99ba-6d3bcd3e5199/",
        "Injections.csv": "https://data.caltech.edu/tindfiles/serve/af750209-e6c5-4f6a-a508-d1c7a10ea99c/",
        "Read Sets.csv": "https://data.caltech.edu/tindfiles/serve/de1103d2-7221-47fb-82fa-d05579e6a8e8/",
        "References.csv": "https://data.caltech.edu/tindfiles/serve/5232dd34-c856-4206-b7b0-626bc4dcc481/",
        "Sequencing Libraries.csv": "https://data.caltech.edu/tindfiles/serve/f835967f-89f2-4274-a7e6-20439e8fafc1/",
        "Sequencing Runs.csv": "https://data.caltech.edu/tindfiles/serve/a402f196-8f74-4941-bc07-8c8e5c0b0256/",
        "Templates.csv": "https://data.caltech.edu/tindfiles/serve/36dec17b-c9af-4805-9dad-bff890b6c49d/",
        "Tissue Samples.csv": "https://data.caltech.edu/tindfiles/serve/012a00a3-f1df-46c2-b422-5780b621d548/",
        "Vector Pools.csv": "https://data.caltech.edu/tindfiles/serve/e5796184-df3a-42d5-b9aa-2f01b54308ee/",
        "Vectors.csv": "https://data.caltech.edu/tindfiles/serve/86ecec8f-9cc7-4835-abc3-e42a93e41f59/",
        "Viruses.csv": "https://data.caltech.edu/tindfiles/serve/0b188950-2b76-4c41-8427-2d7282bd1a57/"
    },
    # "." places these files directly in database.DATA_PATH itself
    ".": {
        "CCN202105070_marker_gene_clusters.csv":  "https://data.caltech.edu/tindfiles/serve/89a0a7e8-9a22-43a6-9c97-5569556fc36e/",
        "aavomics_cell_type_transduction_rates.csv":  "https://data.caltech.edu/tindfiles/serve/2fe89158-45c3-4a6b-b5e3-253b4ed02695/",
        "aavomics_marker_gene_transduction_rates.csv":  "https://data.caltech.edu/tindfiles/serve/95c885e4-3ac9-4dbf-a950-f55654d33526/",
        "aavomics_SRA_file_metadata.csv":  "https://data.caltech.edu/tindfiles/serve/0a9efb98-49d1-4504-bc21-96b3b5efb170/",
        "aavomics_SRA_metadata.csv":  "https://data.caltech.edu/tindfiles/serve/e10f3886-9550-4d85-aba6-f1e81fb86bd9/",
        "CCN202105070_aavomics_cluster_transduction_rates.csv":  "https://data.caltech.edu/tindfiles/serve/5037b46a-89a9-476f-b56e-56faa5281ba5/",
        "aavomics_sample_metadata.csv":  "https://data.caltech.edu/tindfiles/serve/80437b7c-bbf3-43ab-887b-e4193061eeb7/",
        "CCN202105070_gene_clusters.csv":  "https://data.caltech.edu/tindfiles/serve/a76ab506-0e44-4e70-9fc5-502186880c0d/",
        "aavomics_immune_study_DE_3_vs_25_DPI.xlsx":  "https://data.caltech.edu/tindfiles/serve/a56e671f-5f5e-4ea3-8ebf-f9d567975f7e/",
        "aavomics_cell_metadata.csv":  "https://data.caltech.edu/tindfiles/serve/0196c57d-e00e-453a-88ac-d7ab5b46f22f/",
        "aavomics_mouse_cortex_2021_droplet_training_data.h5ad":  "https://data.caltech.edu/tindfiles/serve/e7563f14-af07-4bb2-a4fe-11784bca8006/",
        "aavomics_mouse_cortex_2021.h5ad":  "https://data.caltech.edu/tindfiles/serve/1df614db-0d72-4d41-8c44-581043793349/"
    }
}

In [None]:
# Walk the download listing one destination directory at a time and fetch each
# file into it. skip_if_exists=True asks the helper not to re-fetch files that
# are already present locally.
for file_path, files_in_directory in FILES_TO_DOWNLOAD.items():
    directory_path = os.path.join(database.DATA_PATH, file_path)

    for file_name, file_URL in files_in_directory.items():
        download_path = os.path.join(directory_path, file_name)
        fileio.download_remote_file(file_URL, download_path, skip_if_exists=True)

In [None]:
# Load the aavomics database.
# NOTE(review): presumably this reads the CSV files downloaded into
# database.DATA_PATH by the previous cell — confirm against aavomics.database.
database.load_database()

# Download preprocessed data

In [None]:
# Name of the Cell Ranger reference bundle, the URL of its .tar.gz archive on
# the 10x Genomics server, and the local directory reserved for references.
CELLRANGER_REFERENCE = "refdata-gex-mm10-2020-A"
CELLRANGER_REFERENCE_URL = "".join(
    ["https://cf.10xgenomics.com/supp/cell-exp/", CELLRANGER_REFERENCE, ".tar.gz"]
)
CELLRANGER_DOWNLOAD_PATH = os.path.join(database.DATA_PATH, "references")

# Download references

In [None]:
# TODO: Add raw FASTQ files

# Reference files to download. NOTE: this rebinds FILES_TO_DOWNLOAD, replacing
# the database file listing defined earlier in the notebook.
# Outer keys are destination sub-directories relative to database.DATA_PATH;
# each inner dict maps a local file name to its remote URL. Entries ending in
# ".tar.gz" or ".gz" are unpacked by the download loop that follows.
FILES_TO_DOWNLOAD = {
    # Cell Ranger reference archive
    "references": {
        "refdata-gex-mm10-2020-A.tar.gz": "https://data.caltech.edu/tindfiles/serve/dafa7bec-a84b-4dfe-a965-ba82abe4f3d5/"
    },
    # Metadata and matrix CSVs hosted in the aibs_mouse_ctx-hip_10x bucket folder
    os.path.join("reference_databases", "20200331_Allen_Cortex_Hippocampus_10X_v3"): {
        "metadata.csv": "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/metadata.csv",
        "matrix.csv": "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hip_10x/matrix.csv"
    },
    # Genome FASTA (Ensembl release 84, GRCm38 primary assembly), fetched over FTP
    os.path.join("references", "mm10-allen-premRNA", "fasta"): {
        "genome.fa.gz": "ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz"
    },
    # Gene annotation (GTF) stored alongside the FASTA under mm10-allen-premRNA
    os.path.join("references", "mm10-allen-premRNA", "genes"): {
        "genes.gtf.gz": "https://data.caltech.edu/tindfiles/serve/66e6df77-2766-45fb-b113-9b3a83285efc/"
    }
}

In [None]:
def _unpacked_path(download_path):
    """Return the path a downloaded archive is unpacked to.

    ".tar.gz" archives unpack into a directory named after the archive without
    its suffix; plain ".gz" files decompress to a file without the ".gz"
    suffix. Returns None when ``download_path`` is not an archive.
    """
    # ".tar.gz" must be tested first because such names also end in ".gz".
    for suffix in (".tar.gz", ".gz"):
        if download_path.endswith(suffix):
            return download_path[:-len(suffix)]
    return None


def _unpack_download(download_path, unpacked_path):
    """Unpack a ".tar.gz" or ".gz" download to ``unpacked_path``, then delete the archive.

    The archive is removed only after unpacking completed without raising, so
    a leftover archive always means "not (fully) unpacked yet".
    """
    if download_path.endswith(".tar.gz"):
        # Context manager closes the tar handle even if extraction fails.
        with tarfile.open(download_path) as tar:
            tar.extractall(unpacked_path)
    else:
        with gzip.open(download_path, "rb") as compressed_file:
            with open(unpacked_path, "wb") as uncompressed_file:
                shutil.copyfileobj(compressed_file, uncompressed_file)

    os.remove(download_path)


# Download every reference file into its sub-directory of database.DATA_PATH
# and unpack the compressed ones.
for file_path, files_in_directory in FILES_TO_DOWNLOAD.items():

    for file_name, file_URL in files_in_directory.items():

        download_path = os.path.join(database.DATA_PATH, file_path, file_name)
        unpacked_path = _unpacked_path(download_path)

        # Archives are deleted after unpacking, so skip_if_exists alone can
        # never recognise an earlier successful run. Treat "unpacked output
        # present and archive gone" as already done to avoid re-downloading
        # on every re-run. If the archive is still there, a previous run was
        # interrupted, so fall through and unpack again.
        if (
            unpacked_path is not None
            and os.path.exists(unpacked_path)
            and not os.path.exists(download_path)
        ):
            continue

        fileio.download_remote_file(file_URL, download_path, skip_if_exists=True)

        if unpacked_path is not None:
            _unpack_download(download_path, unpacked_path)

# Download raw data

Raw FASTQ files can be downloaded from the SRA, here: https://www.ncbi.nlm.nih.gov/bioproject/758711

Each transcriptome FASTQ file should be in a folder within the database.DATA_PATH directory with the following structure:
```cell_sets\{CELL_SET_NAME}\transcriptome\reads```, where {CELL_SET_NAME} corresponds to the full dated name of the cell set, as in database.CELL_SETS_DICT.

Each amplified FASTQ file should be in a folder within the database.DATA_PATH directory with the following structure:
```cell_sets\{CELL_SET_NAME}\virus\reads```, where {CELL_SET_NAME} corresponds to the full dated name of the cell set, as in database.CELL_SETS_DICT.